Skip to content

Commit

Permalink
sled-agent: Don't block during instance creation request from nexus (#…
Browse files Browse the repository at this point in the history
…4691)

Alleviating request timeouts occurring when propolis zone installation
takes too long - #3927 - by making the zone installation not happen
during a request handler, and instead having sled-agent report the
result of instance creation back to nexus with an internal cpapi call.

In this world where instance create calls to nexus now no longer block
on sled-agent's propolis zone installation finishing, care must be taken
in e.g. integration tests not to assume you can, say, connect to the
instance as soon as the instance create call returns.

Accordingly, integration tests have been changed to poll until the
instance is ready, and additionally some unit tests have been added
for instance creation in sled-agent's Instance and InstanceManager.
  • Loading branch information
lifning authored Mar 19, 2024
1 parent 89e7b42 commit f8c8bb1
Show file tree
Hide file tree
Showing 11 changed files with 912 additions and 24 deletions.
70 changes: 56 additions & 14 deletions end-to-end-tests/src/instance_launch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
use oxide_client::types::{
ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate,
InstanceCpuCount, InstanceCreate, InstanceDiskAttachment,
InstanceNetworkInterfaceAttachment, SshKeyCreate,
InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate,
};
use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt};
use russh::{ChannelMsg, Disconnect};
use russh_keys::key::{KeyPair, PublicKey};
use russh_keys::PublicKeyBase64;
use std::sync::Arc;
use std::time::Duration;
use tokio::time::sleep;

#[tokio::test]
async fn instance_launch() -> Result<()> {
Expand Down Expand Up @@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> {
type Error =
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

let instance_state = ctx
.client
.instance_view()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.send()
.await?
.run_state;

if instance_state == InstanceState::Starting {
return Err(Error::NotYet);
}

let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
Expand Down Expand Up @@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> {

// check that we saw it on the console
eprintln!("waiting for serial console");
sleep(Duration::from_secs(5)).await;
let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.most_recent(1024 * 1024)
.max_bytes(1024 * 1024)
.send()
.await?
.data,

let data = wait_for_condition(
|| async {
type Error =
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

let instance_state = ctx
.client
.instance_view()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.send()
.await?
.run_state;

if instance_state == InstanceState::Starting {
return Err(Error::NotYet);
}

let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.most_recent(1024 * 1024)
.max_bytes(1024 * 1024)
.send()
.await
.map_err(|_e| Error::NotYet)?
.data,
)
.into_owned();
if data.contains("-----END SSH HOST KEY KEYS-----") {
Ok(data)
} else {
Err(Error::NotYet)
}
},
&Duration::from_secs(5),
&Duration::from_secs(300),
)
.into_owned();
.await?;

ensure!(
data.contains("Hello, Oxide!"),
"string not seen on console\n{}",
Expand Down
48 changes: 46 additions & 2 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1193,29 +1193,55 @@ impl ZoneBuilderFactory {
/// Created by [ZoneBuilderFactory].
#[derive(Default)]
pub struct ZoneBuilder<'a> {
/// Logger to which status messages are written during zone installation.
log: Option<Logger>,
/// Allocates the NIC used for control plane communication.
underlay_vnic_allocator: Option<&'a VnicAllocator<Etherstub>>,
/// Filesystem path at which the installed zone will reside.
zone_root_path: Option<&'a Utf8Path>,
/// The directories that will be searched for the image tarball for the
/// provided zone type ([`Self::with_zone_type`]).
zone_image_paths: Option<&'a [Utf8PathBuf]>,
/// The name of the type of zone being created (e.g. "propolis-server")
zone_type: Option<&'a str>,
unique_name: Option<Uuid>, // actually optional
/// Unique ID of the instance of the zone being created. (optional)
// *actually* optional (in contrast to other fields that are `Option` for
// builder purposes - that is, skipping this field in the builder will
// still result in an `Ok(InstalledZone)` from `.install()`, rather than
// an `Err(InstallZoneError::IncompleteBuilder)`.
unique_name: Option<Uuid>,
/// ZFS datasets to be accessed from within the zone.
datasets: Option<&'a [zone::Dataset]>,
/// Filesystems to mount within the zone.
filesystems: Option<&'a [zone::Fs]>,
/// Additional network device names to add to the zone.
data_links: Option<&'a [String]>,
/// Device nodes to pass through to the zone.
devices: Option<&'a [zone::Device]>,
/// OPTE devices for the guest network interfaces.
opte_ports: Option<Vec<(Port, PortTicket)>>,
bootstrap_vnic: Option<Link>, // actually optional
/// NIC to use for creating a bootstrap address on the switch zone.
// actually optional (as above)
bootstrap_vnic: Option<Link>,
/// Physical NICs possibly provisioned to the zone.
links: Option<Vec<Link>>,
/// The maximum set of privileges any process in this zone can obtain.
limit_priv: Option<Vec<String>>,
/// For unit tests only: if `Some`, then no actual zones will be installed
/// by this builder, and minimal facsimiles of them will be placed in
/// temporary directories according to the contents of the provided
/// `FakeZoneBuilderConfig`.
fake_cfg: Option<FakeZoneBuilderConfig>,
}

impl<'a> ZoneBuilder<'a> {
/// Logger to which status messages are written during zone installation.
pub fn with_log(mut self, log: Logger) -> Self {
self.log = Some(log);
self
}

/// Allocates the NIC used for control plane communication.
pub fn with_underlay_vnic_allocator(
mut self,
vnic_allocator: &'a VnicAllocator<Etherstub>,
Expand All @@ -1224,11 +1250,14 @@ impl<'a> ZoneBuilder<'a> {
self
}

/// Filesystem path at which the installed zone will reside.
pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self {
self.zone_root_path = Some(root_path);
self
}

/// The directories that will be searched for the image tarball for the
/// provided zone type ([`Self::with_zone_type`]).
pub fn with_zone_image_paths(
mut self,
image_paths: &'a [Utf8PathBuf],
Expand All @@ -1237,56 +1266,68 @@ impl<'a> ZoneBuilder<'a> {
self
}

/// The name of the type of zone being created (e.g. "propolis-server")
pub fn with_zone_type(mut self, zone_type: &'a str) -> Self {
self.zone_type = Some(zone_type);
self
}

/// Unique ID of the instance of the zone being created. (optional)
pub fn with_unique_name(mut self, uuid: Uuid) -> Self {
self.unique_name = Some(uuid);
self
}

/// ZFS datasets to be accessed from within the zone.
pub fn with_datasets(mut self, datasets: &'a [zone::Dataset]) -> Self {
self.datasets = Some(datasets);
self
}

/// Filesystems to mount within the zone.
pub fn with_filesystems(mut self, filesystems: &'a [zone::Fs]) -> Self {
self.filesystems = Some(filesystems);
self
}

/// Additional network device names to add to the zone.
pub fn with_data_links(mut self, links: &'a [String]) -> Self {
self.data_links = Some(links);
self
}

/// Device nodes to pass through to the zone.
pub fn with_devices(mut self, devices: &'a [zone::Device]) -> Self {
self.devices = Some(devices);
self
}

/// OPTE devices for the guest network interfaces.
pub fn with_opte_ports(mut self, ports: Vec<(Port, PortTicket)>) -> Self {
self.opte_ports = Some(ports);
self
}

/// NIC to use for creating a bootstrap address on the switch zone.
/// (optional)
pub fn with_bootstrap_vnic(mut self, vnic: Link) -> Self {
self.bootstrap_vnic = Some(vnic);
self
}

/// Physical NICs possibly provisioned to the zone.
pub fn with_links(mut self, links: Vec<Link>) -> Self {
self.links = Some(links);
self
}

/// The maximum set of privileges any process in this zone can obtain.
pub fn with_limit_priv(mut self, limit_priv: Vec<String>) -> Self {
self.limit_priv = Some(limit_priv);
self
}

// (used in unit tests)
fn fake_install(self) -> Result<InstalledZone, InstallZoneError> {
let zone = self
.zone_type
Expand Down Expand Up @@ -1324,6 +1365,9 @@ impl<'a> ZoneBuilder<'a> {
.ok_or(InstallZoneError::IncompleteBuilder)
}

/// Create the zone with the provided parameters.
/// Returns `Err(InstallZoneError::IncompleteBuilder)` if a necessary
/// parameter was not provided.
pub async fn install(self) -> Result<InstalledZone, InstallZoneError> {
if self.fake_cfg.is_some() {
return self.fake_install();
Expand Down
3 changes: 3 additions & 0 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,9 @@ impl super::Nexus {
//
// If the operation failed, kick the sled agent error back up to
// the caller to let it decide how to handle it.
//
// When creating the zone for the first time, we just get
// Ok(None) here, which is a no-op in write_returned_instance_state.
match instance_put_result {
Ok(state) => self
.write_returned_instance_state(&instance_id, state)
Expand Down
21 changes: 18 additions & 3 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ use sled_agent_client::TestInterfaces as _;
use std::convert::TryFrom;
use std::net::Ipv4Addr;
use std::sync::Arc;
use std::time::Duration;
use uuid::Uuid;

use dropshot::test_util::ClientTestContext;
Expand All @@ -81,6 +82,8 @@ use nexus_test_utils::resource_helpers::{
use nexus_test_utils_macros::nexus_test;
use nexus_types::external_api::shared::SiloRole;
use omicron_sled_agent::sim;
use omicron_test_utils::dev::poll;
use omicron_test_utils::dev::poll::CondCheckError;

type ControlPlaneTestContext =
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
Expand Down Expand Up @@ -3795,10 +3798,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) {

// Create an instance and poke it to ensure it's running.
let instance = create_instance(client, PROJECT_NAME, instance_name).await;
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
let instance_next = poll::wait_for_condition(
|| async {
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
if instance_next.runtime.run_state == InstanceState::Running {
Ok(instance_next)
} else {
Err(CondCheckError::<()>::NotYet)
}
},
&Duration::from_secs(5),
&Duration::from_secs(60),
)
.await
.unwrap();
identity_eq(&instance.identity, &instance_next.identity);
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
assert!(
instance_next.runtime.time_run_state_updated
> instance.runtime.time_run_state_updated
Expand Down
34 changes: 33 additions & 1 deletion sled-agent/src/fakes/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ use hyper::Body;
use internal_dns::ServiceName;
use nexus_client::types::SledAgentInfo;
use omicron_common::api::external::Error;
use omicron_common::api::internal::nexus::UpdateArtifactId;
use omicron_common::api::internal::nexus::{
SledInstanceState, UpdateArtifactId,
};
use schemars::JsonSchema;
use serde::Deserialize;
use uuid::Uuid;
Expand Down Expand Up @@ -44,6 +46,14 @@ pub trait FakeNexusServer: Send + Sync {
) -> Result<(), Error> {
Err(Error::internal_error("Not implemented"))
}

fn cpapi_instances_put(
&self,
_instance_id: Uuid,
_new_runtime_state: SledInstanceState,
) -> Result<(), Error> {
Err(Error::internal_error("Not implemented"))
}
}

/// Describes the server context type.
Expand Down Expand Up @@ -107,11 +117,33 @@ async fn sled_agent_put(
Ok(HttpResponseUpdatedNoContent())
}

#[derive(Deserialize, JsonSchema)]
struct InstancePathParam {
instance_id: Uuid,
}
#[endpoint {
method = PUT,
path = "/instances/{instance_id}",
}]
async fn cpapi_instances_put(
request_context: RequestContext<ServerContext>,
path_params: Path<InstancePathParam>,
new_runtime_state: TypedBody<SledInstanceState>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let context = request_context.context();
context.cpapi_instances_put(
path_params.into_inner().instance_id,
new_runtime_state.into_inner(),
)?;
Ok(HttpResponseUpdatedNoContent())
}

fn api() -> ApiDescription<ServerContext> {
let mut api = ApiDescription::new();
api.register(cpapi_artifact_download).unwrap();
api.register(sled_agent_get).unwrap();
api.register(sled_agent_put).unwrap();
api.register(cpapi_instances_put).unwrap();
api
}

Expand Down
Loading

0 comments on commit f8c8bb1

Please sign in to comment.