-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RSDK-9593] move ota to config monitor, config monitor restart hook #369
Changes from 13 commits
f770c97
a9c61b6
26f1f6e
986e3a3
d8e2ee6
acb1e5e
adb66d4
c2eebd8
3600bff
b003612
9caf29c
42fceb5
e63c84b
c7b1b2e
532233a
7444b59
1a6bae6
a17c32c
132a455
b23714c
6f9685e
2c233f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,13 +8,16 @@ use crate::{ | |
}; | ||
use async_io::Timer; | ||
use futures_lite::{Future, FutureExt}; | ||
use std::fmt::Debug; | ||
use std::pin::Pin; | ||
use std::time::Duration; | ||
use std::{fmt::Debug, pin::Pin, time::Duration}; | ||
|
||
#[cfg(feature = "ota")] | ||
use crate::common::{exec::Executor, ota}; | ||
|
||
pub struct ConfigMonitor<'a, Storage> { | ||
curr_config: Box<RobotConfig>, //config for robot gotten from last robot startup, aka inputted from entry | ||
storage: Storage, | ||
#[cfg(feature = "ota")] | ||
executor: Executor, | ||
restart_hook: Box<dyn Fn() + 'a>, | ||
} | ||
|
||
|
@@ -27,11 +30,14 @@ where | |
pub fn new( | ||
curr_config: Box<RobotConfig>, | ||
storage: Storage, | ||
#[cfg(feature = "ota")] executor: Executor, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: not ideal to have function parameters be dependent on features There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if either a) we decide to revert back to a separate ota task or b) if OTA becomes default and we remove the feagure-gate, we could remove this. |
||
restart_hook: impl Fn() + 'a, | ||
) -> Self { | ||
Self { | ||
curr_config, | ||
storage, | ||
#[cfg(feature = "ota")] | ||
executor, | ||
restart_hook: Box::new(restart_hook), | ||
} | ||
} | ||
|
@@ -71,16 +77,47 @@ where | |
}) | ||
.await?; | ||
|
||
if new_config | ||
.config | ||
.is_some_and(|cfg| cfg != *self.curr_config) | ||
{ | ||
if let Err(e) = self.storage.reset_robot_configuration() { | ||
log::warn!( | ||
"Failed to reset robot config after new config detected: {}", | ||
e | ||
); | ||
} else { | ||
if let Some(config) = new_config.as_ref().config.as_ref() { | ||
let mut reboot = false; | ||
|
||
#[cfg(feature = "ota")] | ||
{ | ||
if let Some(service) = config | ||
.services | ||
.iter() | ||
.find(|&service| service.model == *ota::OTA_MODEL_TRIPLET) | ||
{ | ||
match ota::OtaService::from_config( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We end up instantiating a new OtaService and doing a version check each time the ConfigMonitor triggers that's not super ideal There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. made a ticket, though may be a non-issue if we revert back to separate ota task after testing. RSDK-9676 |
||
service, | ||
self.storage.clone(), | ||
self.executor.clone(), | ||
) { | ||
Ok(mut ota) => match ota.update().await { | ||
Ok(needs_reboot) => reboot = needs_reboot, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC at reboot the config retrieved offline overwrites the stored config, is there a particular reason for the ota to not reboot immediately on successful download? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe not a likely scenario, but if app.viam becomes unreachable during the download period, at least it would be operating off of the new config at reboot. we can iterate on it |
||
Err(e) => log::error!("failed to complete ota update: {}", e), | ||
}, | ||
Err(e) => log::error!( | ||
"failed to create ota service from config:{} - {:?}", | ||
e, | ||
service, | ||
), | ||
} | ||
} | ||
} | ||
|
||
if *config != *self.curr_config { | ||
if let Err(e) = self.storage.reset_robot_configuration() { | ||
log::warn!( | ||
"Failed to reset robot config after new config detected: {}", | ||
e | ||
); | ||
} else { | ||
reboot = true; | ||
} | ||
} | ||
|
||
if reboot { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looks cleaner to use a flag instead of fiddling with feature gates |
||
// TODO(RSDK-9464): flush logs to app.viam before restarting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This ticket would be another reason to DRY the shutdown handling. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as in, have a global shutdown function that handles log-flushing? |
||
self.restart(); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,7 +51,7 @@ use super::server::{IncomingConnectionManager, WebRtcConfiguration}; | |
use crate::common::provisioning::server::AsNetwork; | ||
|
||
#[cfg(feature = "ota")] | ||
use crate::common::{credentials_storage::OtaMetadataStorage, ota}; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've left this here for now instead of shifting it to |
||
use crate::common::credentials_storage::OtaMetadataStorage; | ||
|
||
pub struct RobotCloudConfig { | ||
local_fqdn: String, | ||
|
@@ -364,8 +364,6 @@ where | |
http2_server_port: self.http2_server_port, | ||
wifi_manager: self.wifi_manager.into(), | ||
app_client_tasks: self.app_client_tasks, | ||
#[cfg(feature = "ota")] | ||
ota_service_task: Default::default(), | ||
max_concurrent_connections: self.max_concurrent_connections, | ||
network: Some(network), | ||
} | ||
|
@@ -400,8 +398,6 @@ where | |
http2_server_port: self.http2_server_port, | ||
wifi_manager: Rc::new(self.wifi_manager), | ||
app_client_tasks: self.app_client_tasks, | ||
#[cfg(feature = "ota")] | ||
ota_service_task: None, | ||
max_concurrent_connections: self.max_concurrent_connections, | ||
network: None, | ||
} | ||
|
@@ -420,8 +416,6 @@ pub struct ViamServer<Storage, C, M> { | |
http2_server_port: u16, | ||
wifi_manager: Rc<Option<Box<dyn WifiManager>>>, | ||
app_client_tasks: Vec<Box<dyn PeriodicAppClientTask>>, | ||
#[cfg(feature = "ota")] | ||
ota_service_task: Option<Task<()>>, | ||
max_concurrent_connections: usize, | ||
network: Option<Box<dyn Network>>, | ||
} | ||
|
@@ -486,6 +480,14 @@ where | |
self.provision().await; | ||
} | ||
|
||
#[cfg(feature = "ota")] | ||
{ | ||
if self.storage.has_ota_metadata() { | ||
let metadata = self.storage.get_ota_metadata().unwrap_or_default(); | ||
log::info!("firmware version: {}", metadata.version); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will log the empty string if there is no OTA metadata in NVS. I'm not sure that's the best outcome. Can we be more explicit? |
||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. adding this here avoids complicating config monitor's ota stuff to get the right logging behavior. |
||
|
||
// Since provisioning was run and completed, credentials are properly populated | ||
// if wifi manager is configured loop forever until wifi is connected | ||
if let Some(wifi) = self.wifi_manager.as_ref().as_ref() { | ||
|
@@ -573,48 +575,20 @@ where | |
log::error!("couldn't store the robot configuration reason {:?}", err); | ||
} | ||
|
||
#[cfg(feature = "esp32")] | ||
let hook = || crate::esp32::esp_idf_svc::hal::reset::restart(); | ||
#[cfg(not(feature = "esp32"))] | ||
let hook = || std::process::exit(0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the motivation was to get rid of the coredump we see everytime on config change restarts since I'm already modifying the config_monitor's behavior. I mention it in the description about partially addressing RSDK-9594, but just in config monitor. I can look into There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. per offline discussion, changing this back to just use |
||
|
||
let config_monitor_task = Box::new(ConfigMonitor::new( | ||
config.clone(), | ||
self.storage.clone(), | ||
|| std::process::exit(0), | ||
#[cfg(feature = "ota")] | ||
self.executor.clone(), | ||
hook, | ||
)); | ||
self.app_client_tasks.push(config_monitor_task); | ||
|
||
#[cfg(feature = "ota")] | ||
{ | ||
log::debug!("ota feature enabled"); | ||
|
||
if let Some(service) = config | ||
.services | ||
.iter() | ||
.find(|&service| service.model == *ota::OTA_MODEL_TRIPLET) | ||
{ | ||
match ota::OtaService::from_config( | ||
service, | ||
self.storage.clone(), | ||
self.executor.clone(), | ||
) { | ||
Ok(mut ota) => { | ||
self.ota_service_task | ||
.replace(self.executor.spawn(async move { | ||
if let Err(e) = ota.update().await { | ||
log::error!("failed to complete ota update {}", e); | ||
} | ||
})); | ||
} | ||
Err(e) => { | ||
log::error!("failed to build ota service: {}", e.to_string()); | ||
log::error!("ota service config: {:?}", service); | ||
} | ||
}; | ||
} else { | ||
log::error!( | ||
"ota enabled, but no service of type `{}` found in robot config", | ||
ota::OTA_MODEL_TYPE | ||
); | ||
} | ||
} | ||
|
||
let mut robot = LocalRobot::from_cloud_config( | ||
self.executor.clone(), | ||
robot_creds.robot_id.clone(), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,7 +59,8 @@ use thiserror::Error; | |
#[cfg(not(feature = "esp32"))] | ||
use {bincode::Decode, futures_lite::AsyncWriteExt}; | ||
|
||
const CONN_RETRY_SECS: u64 = 60; | ||
const CONN_RETRY_SECS: u64 = 1; | ||
const NUM_RETRY_CONN: usize = 3; | ||
const SIZEOF_APPDESC: usize = 256; | ||
const MAX_VER_LEN: usize = 128; | ||
pub const OTA_MODEL_TYPE: &str = "ota_service"; | ||
|
@@ -156,9 +157,6 @@ impl OtaMetadata { | |
pub fn new(version: String) -> Self { | ||
Self { version } | ||
} | ||
pub(crate) fn version(&self) -> &str { | ||
&self.version | ||
} | ||
} | ||
|
||
pub(crate) struct OtaService<S: OtaMetadataStorage> { | ||
|
@@ -172,6 +170,17 @@ pub(crate) struct OtaService<S: OtaMetadataStorage> { | |
} | ||
|
||
impl<S: OtaMetadataStorage> OtaService<S> { | ||
pub(crate) async fn stored_metadata(&self) -> OtaMetadata { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this function really There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It isn't, but shouldn't it be? The underlying calls are essentially I/O, getting data from flash storage. But this is maybe is more of a "should our storage APIs be async?" question. I'll remove it here though. |
||
if !self.storage.has_ota_metadata() { | ||
log::info!("no ota metadata currently stored in NVS"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: |
||
} | ||
|
||
self.storage | ||
.get_ota_metadata() | ||
.inspect_err(|e| log::warn!("failed to get ota metadata from nvs: {}", e)) | ||
.unwrap_or_default() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why return a default value rather than an error? |
||
} | ||
|
||
pub(crate) fn from_config( | ||
new_config: &ServiceConfig, | ||
storage: S, | ||
|
@@ -259,28 +268,17 @@ impl<S: OtaMetadataStorage> OtaService<S> { | |
}) | ||
} | ||
|
||
pub(crate) async fn update(&mut self) -> Result<(), OtaError> { | ||
let stored_metadata = if !self.storage.has_ota_metadata() { | ||
log::info!("no ota metadata currently stored in NVS"); | ||
OtaMetadata::default() | ||
} else { | ||
self.storage | ||
.get_ota_metadata() | ||
.inspect_err(|e| log::warn!("failed to get ota metadata from nvs: {}", e)) | ||
.unwrap_or_default() | ||
}; | ||
pub(crate) async fn needs_update(&self) -> bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this function really |
||
self.stored_metadata().await.version != self.pending_version | ||
} | ||
|
||
if self.pending_version == stored_metadata.version() { | ||
log::info!("firmware is up-to-date: `{}`", stored_metadata.version); | ||
return Ok(()); | ||
/// Attempts to perform an OTA update. | ||
/// On success, returns an `Ok(true)` or `Ok(false)` indicating if a reboot is necessary. | ||
pub(crate) async fn update(&mut self) -> Result<bool, OtaError> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't particularly love the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you prefer a field in |
||
if !(self.needs_update().await) { | ||
return Ok(false); | ||
} | ||
|
||
log::info!( | ||
"firmware is out of date, proceeding with update from version `{}` to version `{}`", | ||
stored_metadata.version, | ||
self.pending_version | ||
); | ||
|
||
let mut uri = self | ||
.url | ||
.parse::<hyper::Uri>() | ||
|
@@ -304,11 +302,18 @@ impl<S: OtaMetadataStorage> OtaService<S> { | |
uri = hyper::Uri::from_parts(parts).map_err(|e| OtaError::Other(e.to_string()))?; | ||
}; | ||
|
||
let mut num_tries = 0; | ||
let (mut sender, conn) = loop { | ||
if num_tries == NUM_RETRY_CONN { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a necessary part of this change? It seems independent of moving the driver of OTA checks to be the ConfigMonitor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we don't limit the number of connection attempts, it will block further config changes. So the change is to still have a limited number of short-lived retry attempts in case there's initial wonkiness with the connection. If there is any general wonkiness with establishing the connection, there isn't any guarantee that returning immediately and depending on the next invocation of ConfigMonitor (in 10sec) will resolve it. three one-second retries seemed like a decent middle ground that we can iterate on if anything. |
||
return Err(OtaError::Other( | ||
"failed to establish connection".to_string(), | ||
)); | ||
} | ||
match self.connector.connect_to(&uri) { | ||
Ok(connection) => { | ||
match connection.await { | ||
Ok(io) => { | ||
// TODO(RSDK-9617): add timeout for stalled download | ||
match http2::Builder::new(self.exec.clone()) | ||
.max_frame_size(16_384) // lowest configurable | ||
.timer(H2Timer) | ||
|
@@ -337,6 +342,7 @@ impl<S: OtaMetadataStorage> OtaService<S> { | |
CONN_RETRY_SECS | ||
); | ||
Timer::after(Duration::from_secs(CONN_RETRY_SECS)).await; | ||
num_tries += 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried exploring different paths for a day or two:
I think I tried a couple other things, (like inspect,map_err, ok_and, etc) but it seemed like too much time spend on trying to avoid the messiness and have the correct behavior. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can move the increment to the top of the loop and set the range accordingly |
||
}; | ||
|
||
let conn = Box::new(self.exec.spawn(async move { | ||
|
@@ -520,15 +526,18 @@ impl<S: OtaMetadataStorage> OtaService<S> { | |
.map_err(|e| OtaError::Other(e.to_string()))?; | ||
|
||
log::info!("firmware update complete"); | ||
|
||
// Test experimental ffi accesses here to be recoverable without flashing | ||
// verifies nvs was stored correctly | ||
let curr_metadata = self.stored_metadata().await; | ||
log::info!( | ||
"firmware update successful: version `{}`", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you have this, do you still need the "firmware update complete" line above? |
||
curr_metadata.version | ||
); | ||
// Note: test experimental ota ffi accesses here to be recoverable without flashing | ||
#[cfg(feature = "esp32")] | ||
{ | ||
log::info!("rebooting to load firmware from `{:#x}`", self.address); | ||
// TODO(RSDK-9464): flush logs to app.viam before restarting | ||
esp_idf_svc::hal::reset::restart(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this shifts the responsibility of rebooting to whoever is calling |
||
log::info!("next reboot will load firmware from `{:#x}`", self.address); | ||
} | ||
|
||
Ok(()) | ||
Ok(true) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: inconsistent with how we usually do
cfg
things.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rustfmt trips on it https://github.com/viamrobotics/micro-rdk/actions/runs/12655226567/job/35265116182?pr=369