Skip to content

Commit

Permalink
Fix flaky instance zone-boot timeout test
Browse files Browse the repository at this point in the history
- Fixes #6197
- Use a multi-threaded runtime for the test so we can literally sleep in
  the boot closure without blocking the test itself.
  • Loading branch information
bnaecker committed Aug 6, 2024
1 parent 30edf1c commit e45b7f1
Showing 1 changed file with 40 additions and 24 deletions.
64 changes: 40 additions & 24 deletions sled-agent/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1617,6 +1617,7 @@ mod tests {
use std::net::Ipv6Addr;
use std::net::SocketAddrV6;
use std::str::FromStr;
use std::time::Duration;
use tokio::sync::watch::Receiver;
use tokio::time::timeout;

Expand Down Expand Up @@ -2043,7 +2044,7 @@ mod tests {
logctx.cleanup_successful();
}

#[tokio::test]
#[tokio::test(flavor = "multi_thread")]
async fn test_instance_create_timeout_while_creating_zone() {
let logctx = omicron_test_utils::dev::test_setup_log(
"test_instance_create_timeout_while_creating_zone",
Expand All @@ -2055,17 +2056,30 @@ mod tests {

// time out while booting zone, on purpose!
let boot_ctx = MockZones::boot_context();
let start = tokio::time::Instant::now();
const TIMEOUT: Duration = Duration::from_secs(1);
let (boot_continued_tx, boot_continued_rx) =
std::sync::mpsc::sync_channel(1);
boot_ctx.expect().times(1).return_once(move |_| {
// We need something that will look like the zone taking a long time
// to boot, but we cannot use a `tokio::time` construct here since
// this is a blocking context and we cannot call `block_on()`
// recursively. We advance time by this amount below, so this will
// most likely result in a small number of additional sleeps until
// the timeout has really elased.
while start.elapsed() < TIMEOUT_DURATION * 2 {
std::thread::sleep(std::time::Duration::from_millis(1));
}
// We need a way to slow down zone boot, but that doesn't block the
// entire Tokio runtime. Since this closure is synchronous, it also
// has no way to await anything, all waits are blocking. That means
// we cannot use a single-threaded runtime, which also means no
// manually advancing time. The test has to take the full "slow boot
// time".
//
// To do this, we use a multi-threaded runtime, and call
// block_in_place so that we can just literally sleep for a while.
// The sleep duration here is twice a timeout we set on the attempt
// to actually set the instance running below.
//
// This boot method also directly signals the main test code to
// continue when it's done sleeping to synchronize with it.
tokio::task::block_in_place(move || {
println!("MockZones::boot() called, waiting for timeout");
std::thread::sleep(TIMEOUT * 2);
println!("MockZones::boot() waited for timeout, continuing");
boot_continued_tx.send(()).unwrap();
});
Ok(())
});
let wait_ctx = illumos_utils::svc::wait_for_service_context();
Expand Down Expand Up @@ -2100,8 +2114,6 @@ mod tests {
.await
.expect("timed out creating Instance struct");

tokio::time::pause();

let (put_tx, put_rx) = oneshot::channel();

// pretending we're InstanceManager::ensure_state, try in vain to start
Expand All @@ -2110,20 +2122,16 @@ mod tests {
.await
.expect("failed to send Instance::put_state");

// Timeout our future waiting for the instance-state-change at
// `TIMEOUT_DURATION`, which should fail because zone boot will take
// twice that by construction.
let timeout_fut = timeout(TIMEOUT_DURATION, put_rx);

// And advance time by twice that, so that the actual
// `MockZones::boot()` call should be exercised (or will be soon).
tokio::time::advance(TIMEOUT_DURATION * 2).await;

tokio::time::resume();

// Timeout our future waiting for the instance-state-change at 1s. This
// is much shorter than the actual `TIMEOUT_DURATION`, but the test
// structure requires that we actually wait this period, since we cannot
// advance time manually in a multi-threaded runtime.
let timeout_fut = timeout(TIMEOUT, put_rx);
println!("Awaiting zone-boot timeout");
timeout_fut
.await
.expect_err("*should've* timed out waiting for Instance::put_state, but didn't?");
println!("Zone-boot timeout awaited");

if let ReceivedInstanceState::InstancePut(SledInstanceState {
vmm_state: VmmRuntimeState { state: VmmState::Running, .. },
Expand All @@ -2133,6 +2141,14 @@ mod tests {
panic!("Nexus's InstanceState should never have reached running if zone creation timed out");
}

// Notify the "boot" closure that it can continue, and then wait to
// ensure it's actually called.
println!("Waiting for zone-boot to continue");
tokio::task::spawn_blocking(move || boot_continued_rx.recv().unwrap())
.await
.unwrap();
println!("Received continued message from MockZones::boot()");

storage_harness.cleanup().await;
logctx.cleanup_successful();
}
Expand Down

0 comments on commit e45b7f1

Please sign in to comment.