Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(upgrader): handle unstoppable station in disaster recovery #389

Draft
wants to merge 12 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions core/upgrader/api/spec.did
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ type RequestDisasterRecoveryInput = record {
arg : blob;
// The install mode: Install, Upgrade, or Reinstall.
install_mode : InstallMode;
// Should the station be forced to stop if it is unstoppable.
force : bool;
};

type InstallMode = variant {
Expand Down
2 changes: 2 additions & 0 deletions core/upgrader/api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ pub struct RequestDisasterRecoveryInput {
pub arg: Vec<u8>,

pub install_mode: InstallMode,

pub force: bool,
}

#[derive(CandidType, Deserialize, Debug, Clone)]
Expand Down
2 changes: 2 additions & 0 deletions core/upgrader/impl/src/model/disaster_recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ pub struct StationRecoveryRequest {
pub arg_sha256: Vec<u8>,
/// Time in nanoseconds since the UNIX epoch when the request was submitted.
pub submitted_at: Timestamp,
/// Should the station be forced to stop if the station is unstoppable.
pub force: bool,
}

impl From<StationRecoveryRequest> for upgrader_api::StationRecoveryRequest {
Expand Down
141 changes: 81 additions & 60 deletions core/upgrader/impl/src/services/disaster_recovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,18 @@ use crate::{
upgrader_ic_cdk::{api::time, spawn},
};
use candid::Principal;
use ic_cdk::api::management_canister::main::{
list_canister_snapshots, load_canister_snapshot, take_canister_snapshot, uninstall_code,
CanisterIdRecord, LoadCanisterSnapshotArgs, TakeCanisterSnapshotArgs,
};
use ic_stable_structures::memory_manager::MemoryId;
use lazy_static::lazy_static;
use orbit_essentials::{api::ServiceResult, utils::sha256_hash};

use crate::{
model::{
Account, AdminUser, DisasterRecovery, DisasterRecoveryCommittee, InstallMode,
RecoveryEvaluationResult, RecoveryFailure, RecoveryResult, RecoveryStatus,
StationRecoveryRequest,
Account, AdminUser, DisasterRecovery, DisasterRecoveryCommittee, RecoveryEvaluationResult,
RecoveryFailure, RecoveryResult, RecoveryStatus, StationRecoveryRequest,
},
StableValue, MEMORY_ID_DISASTER_RECOVERY, MEMORY_MANAGER, TARGET_CANISTER_ID,
};
Expand Down Expand Up @@ -55,39 +58,39 @@ pub struct DisasterRecoveryReleaser {

impl Drop for DisasterRecoveryReleaser {
fn drop(&mut self) {
let mut value = self.storage.get();

let last_recovery_result =
self.result
.take()
.unwrap_or(RecoveryResult::Failure(RecoveryFailure {
reason: "Recovery failed with unknown error".to_string(),
}));

value.last_recovery_result = Some(last_recovery_result.clone());
value.recovery_status = RecoveryStatus::Idle;

self.logger.log(LogEntryType::DisasterRecoveryResult(
DisasterRecoveryResultLog {
result: last_recovery_result,
result: last_recovery_result.clone(),
},
));

self.storage.set(value);
self.storage.set_result(last_recovery_result);
}
}

#[derive(Clone, Default)]
pub struct DisasterRecoveryStorage {}

impl DisasterRecoveryStorage {
pub fn get(&self) -> DisasterRecovery {
fn get(&self) -> DisasterRecovery {
STORAGE.with(|storage| storage.borrow().get(&()).unwrap_or_default())
}

fn set(&self, value: DisasterRecovery) {
STORAGE.with(|storage| storage.borrow_mut().insert((), value));
}

fn set_result(&self, result: RecoveryResult) {
let mut value = self.get();
value.last_recovery_result = Some(result);
value.recovery_status = RecoveryStatus::Idle;
self.set(value);
}
}

#[derive(Clone)]
Expand Down Expand Up @@ -233,7 +236,7 @@ impl DisasterRecoveryService {
installer: Arc<dyn InstallCanister>,
logger: Arc<LoggerService>,
request: StationRecoveryRequest,
) {
) -> Result<(), String> {
let mut value = storage.get();

logger.log(LogEntryType::DisasterRecoveryStart(
Expand All @@ -251,63 +254,75 @@ impl DisasterRecoveryService {

if since + DISASTER_RECOVERY_IN_PROGESS_EXPIRATION_NS > time() {
logger.log(LogEntryType::DisasterRecoveryInProgress(log));
return;
return Ok(());
}

logger.log(LogEntryType::DisasterRecoveryInProgressExpired(log));
value.recovery_status = RecoveryStatus::Idle;
}

let Some(station_canister_id) =
TARGET_CANISTER_ID.with(|id| id.borrow().get(&()).map(|id| id.0))
else {
value.last_recovery_result = Some(RecoveryResult::Failure(RecoveryFailure {
reason: "Station canister ID not set".to_string(),
}));
storage.set(value);
return;
};

value.recovery_status = RecoveryStatus::InProgress { since: time() };
storage.set(value);

let station_canister_id = TARGET_CANISTER_ID
.with(|id| id.borrow().get(&()).map(|id| id.0))
.ok_or("Station canister ID not set")?;

let mut releaser = DisasterRecoveryReleaser {
storage: storage.clone(),
result: None,
logger: logger.clone(),
};

// only stop for upgrade
if request.install_mode == InstallMode::Upgrade {
if let Err(err) = installer.stop(station_canister_id).await {
ic_cdk::print(err);
if let Err(err) = installer.stop(station_canister_id).await {
if request.force {
let existing_snapshots = list_canister_snapshots(CanisterIdRecord {
canister_id: station_canister_id,
})
.await
.map_err(|(_code, msg)| msg)?
.0;
let snapshot = take_canister_snapshot(TakeCanisterSnapshotArgs {
canister_id: station_canister_id,
replace_snapshot: existing_snapshots
.into_iter()
.next()
.map(|snapshot| snapshot.id),
})
.await
.map_err(|(_code, msg)| msg)?
.0;
uninstall_code(CanisterIdRecord {
canister_id: station_canister_id,
})
.await
.map_err(|(_code, msg)| msg)?;
load_canister_snapshot(LoadCanisterSnapshotArgs {
canister_id: station_canister_id,
snapshot_id: snapshot.id,
sender_canister_version: None,
})
.await
.map_err(|(_code, msg)| msg)?;
Comment on lines +285 to +306
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this would fail for some reason (e.g. because maybe the target canister would not have enough cycles), then the disaster recovery committee would need to call this endpoint again, however, because we replace that snapshot, then the second try would take a snapshot of an empty canister because we would have already uninstalled it.

To account for such cases, maybe we should store a flag in the upgrader that makes it reuse the same snapshot id on a retry, wdyt?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great catch! I'd suggest the following:

  • support taking and restoring snapshots manually in disaster recovery
  • never take a snapshot automatically: in the force mode a manually created snapshot must be specified, i.e., force: bool becomes force: Option<SnapshotId>

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both look like a safer approach, and in the case of force: Option<SnapshotId>, if provided as None we would then create the snapshot? as it is the behaviour for force=true

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

force: None would correspond to force: false and force: Some(snapshot_id) would correspond to force: true

} else {
return Err(err);
}
}

match installer
installer
.install(
station_canister_id,
request.wasm_module,
request.wasm_module_extra_chunks,
request.arg,
request.install_mode,
)
.await
{
Ok(_) => {
releaser.result = Some(RecoveryResult::Success);
}
Err(reason) => {
releaser.result = Some(RecoveryResult::Failure(RecoveryFailure { reason }));
}
}
.await?;

// only start for upgrade
if request.install_mode == InstallMode::Upgrade {
if let Err(err) = installer.start(station_canister_id).await {
ic_cdk::print(err);
}
}
installer.start(station_canister_id).await?;

releaser.result = Some(RecoveryResult::Success);
Ok(())
}

pub fn request_recovery(
Expand All @@ -332,6 +347,7 @@ impl DisasterRecoveryService {
arg: request.arg,
submitted_at: time(),
install_mode: request.install_mode.into(),
force: request.force,
};

// check if user had previous recovery request
Expand Down Expand Up @@ -365,7 +381,11 @@ impl DisasterRecoveryService {
let logger = self.logger.clone();

spawn(async move {
Self::do_recovery(storage, installer, logger, *request).await;
if let Err(reason) =
Self::do_recovery(storage.clone(), installer, logger, *request).await
{
storage.set_result(RecoveryResult::Failure(RecoveryFailure { reason }));
}
});
}
}
Expand Down Expand Up @@ -489,6 +509,7 @@ mod test {
module: vec![4, 5, 6],
module_extra_chunks: None,
install_mode: upgrader_api::InstallMode::Upgrade,
force: false,
},
);
assert!(dr.storage.get().recovery_requests.is_empty());
Expand All @@ -501,6 +522,7 @@ mod test {
module: vec![4, 5, 6],
module_extra_chunks: None,
install_mode: upgrader_api::InstallMode::Upgrade,
force: false,
},
);

Expand All @@ -517,6 +539,7 @@ mod test {
module: vec![4, 5, 6],
module_extra_chunks: None,
install_mode: upgrader_api::InstallMode::Upgrade,
force: false,
},
);

Expand All @@ -531,6 +554,7 @@ mod test {
module: vec![4, 5, 6],
module_extra_chunks: None,
install_mode: upgrader_api::InstallMode::Upgrade,
force: false,
},
);

Expand Down Expand Up @@ -567,6 +591,7 @@ mod test {
arg: vec![7, 8, 9],
arg_sha256: vec![10, 11, 12],
submitted_at: 0,
force: false,
};

// assert that during install the state is set to InProgress
Expand All @@ -587,7 +612,8 @@ mod test {
logger.clone(),
recovery_request.clone(),
)
.await;
.await
.unwrap();

// calls install in Idle state
assert_eq!(
Expand Down Expand Up @@ -620,7 +646,8 @@ mod test {
logger.clone(),
recovery_request.clone(),
)
.await;
.await
.unwrap();

// does not call install in InProgress state
assert_eq!(
Expand All @@ -646,6 +673,7 @@ mod test {
arg: vec![7, 8, 9],
arg_sha256: vec![10, 11, 12],
submitted_at: 0,
force: false,
};

let installer = Arc::new(TestInstaller::default());
Expand All @@ -656,17 +684,8 @@ mod test {
logger.clone(),
recovery_request.clone(),
)
.await;

assert!(matches!(
storage.get().last_recovery_result,
Some(RecoveryResult::Failure(_))
));

assert!(matches!(
storage.get().recovery_status,
RecoveryStatus::Idle
));
.await
.unwrap_err();
}

#[tokio::test]
Expand All @@ -687,6 +706,7 @@ mod test {
arg: vec![7, 8, 9],
arg_sha256: vec![10, 11, 12],
submitted_at: 0,
force: false,
};

let installer = Arc::new(PanickingTestInstaller::default());
Expand All @@ -700,7 +720,8 @@ mod test {
logger.clone(),
recovery_request.clone(),
)
.await;
.await
.unwrap();

// reset the hook
let _ = take_hook();
Expand Down
2 changes: 1 addition & 1 deletion scripts/run-integration-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ echo "PocketIC download starting"
curl -sLO https://github.com/dfinity/pocketic/releases/download/7.0.0/pocket-ic-x86_64-$PLATFORM.gz || exit 1
gzip -df pocket-ic-x86_64-$PLATFORM.gz
mv pocket-ic-x86_64-$PLATFORM pocket-ic
export POCKET_IC_BIN="$(pwd)/pocket-ic"
chmod +x pocket-ic
export POCKET_IC_BIN="$(pwd)/pocket-ic"
echo "PocketIC download completed"
cd ../..

Expand Down
2 changes: 2 additions & 0 deletions tests/canister/api/spec.did
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ service : {
validate_number : (input : StoreNumberInput) -> (ValidationResponse);
store_number : (input : StoreNumberInput) -> ();
get_number : () -> (nat64) query;
noop : () -> ();
expensive : () -> ();
};
15 changes: 14 additions & 1 deletion tests/canister/impl/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use candid::{CandidType, Deserialize, Principal};
use futures::future::join_all;
use ic_cdk::api::call::call_raw;
use ic_cdk::{query, update};
use ic_cdk::api::performance_counter;
use ic_cdk::{id, query, update};

thread_local! {
static NUMBER: std::cell::RefCell<u64> = const { std::cell::RefCell::new(0) };
Expand Down Expand Up @@ -55,6 +56,18 @@ async fn get_number() -> u64 {
NUMBER.with(|n| *n.borrow())
}

#[update]
async fn noop() {}

#[update]
async fn expensive() {
loop {
if performance_counter(0) >= 19_000_000_000 {
ic_cdk::call::<_, ()>(id(), "noop", ((),)).await.unwrap();
}
}
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading
Loading