Skip to content

Commit

Permalink
wip: plumbing timeout state in internal api response
Browse files Browse the repository at this point in the history
  • Loading branch information
lif committed Jan 19, 2024
1 parent 12e2dcb commit b6f0150
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 48 deletions.
28 changes: 20 additions & 8 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use nexus_db_queries::db::datastore::InstanceAndActiveVmm;
use nexus_db_queries::db::identity::Resource;
use nexus_db_queries::db::lookup;
use nexus_db_queries::db::lookup::LookupPath;
use nexus_types::internal_api::views::HandleInstancePutResultResult;
use omicron_common::address::PROPOLIS_PORT;
use omicron_common::api::external::http_pagination::PaginatedBy;
use omicron_common::api::external::ByteCount;
Expand Down Expand Up @@ -140,6 +141,7 @@ impl From<InstanceStateChangeError> for omicron_common::api::external::Error {

/// The kinds of state changes that can be requested of an instance's current
/// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field).
#[derive(Clone)]
pub(crate) enum InstanceStateChangeRequest {
Run,
Reboot,
Expand Down Expand Up @@ -943,7 +945,9 @@ impl super::Nexus {
let instance_put_result = sa
.instance_put_state(
&instance_id,
&InstancePutStateBody { state: requested.into() },
&InstancePutStateBody {
state: requested.clone().into(),
},
)
.await
.map(|res| res.into_inner().updated_runtime.map(Into::into))
Expand Down Expand Up @@ -982,8 +986,8 @@ impl super::Nexus {
tokio::spawn(async {
tokio::time::sleep(Duration::from_secs(120))
.await;
todo: fail instance
})
todo!("fail instance")
});
}
self.write_returned_instance_state(&instance_id, state)
.await
Expand All @@ -1009,17 +1013,25 @@ impl super::Nexus {
opctx: &OpContext,
instance_id: &Uuid,
result: Result<nexus::SledInstanceState, Error>,
) -> Result<(), Error> {
) -> Result<HandleInstancePutResultResult, Error> {
let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore)
.instance_id(*instance_id)
.lookup_for(authz::Action::Modify)
.await?;

match result {
Ok(new_state) => self
.write_returned_instance_state(instance_id, Some(new_state))
.await
.map(|_| ()),
Ok(new_state) => {
let (inst_updated, vmm_updated) = self
.write_returned_instance_state(instance_id, Some(new_state))
.await?;
if !inst_updated && !vmm_updated {
// the generation number bumped up by the timeout task.
// TODO check actual state / put it in the TimedOut variant?
Ok(HandleInstancePutResultResult::TimedOut)
} else {
Ok(HandleInstancePutResultResult::Ok)
}
}
Err(error) => {
let state = self
.db_datastore
Expand Down
29 changes: 6 additions & 23 deletions nexus/src/internal_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,20 +304,11 @@ async fn cpapi_handle_instance_put_success(
// TODO: if instance_handle_creation_result errors because nexus gave
// up waiting and marked the instance as failed, tell sled-agent to
// destroy the instance
match nexus
nexus
.instance_handle_creation_result(&opctx, &path.instance_id, result)
.await
{
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
Err(err) => {
if todo {
Ok(HandleInstancePutResultResult::TimedOut.into())
} else {
Err(err)
}
}
}
Ok(HttpResponseUpdatedNoContent())
.map(HttpResponseOk)
.map_err(Into::into)
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}
Expand All @@ -341,19 +332,11 @@ async fn cpapi_handle_instance_put_failure(
});
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
let handler = async {
match nexus
nexus
.instance_handle_creation_result(&opctx, &path.instance_id, result)
.await
{
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
Err(err) => {
if todo {
Ok(HandleInstancePutResultResult::TimedOut.into())
} else {
Err(err)
}
}
}
.map(HttpResponseOk)
.map_err(Into::into)
};
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
}
Expand Down
1 change: 1 addition & 0 deletions nexus/types/src/internal_api/views.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ pub struct LastResultCompleted {
/// relevant, or if it's timed out during creation and been marked as failed
/// (such that sled-agent can destroy the tardy instance)
#[derive(Serialize, JsonSchema)]
#[serde(rename_all = "snake_case", tag = "last_result", content = "details")]
pub enum HandleInstancePutResultResult {
Ok,
TimedOut,
Expand Down
63 changes: 55 additions & 8 deletions openapi/nexus-internal.json
Original file line number Diff line number Diff line change
Expand Up @@ -466,8 +466,8 @@
},
"/instances/{instance_id}/creation-failure": {
"put": {
"summary": "Asynchronously report the unsuccessful result of a potentially long-running",
"description": "instance_put call to sled-agent made during instance creation.",
"summary": "Asynchronously report the unsuccessful result of certain instance_put calls",
"description": "(such as the potentially long-running one made during instance creation)",
"operationId": "cpapi_handle_instance_put_failure",
"parameters": [
{
Expand All @@ -492,8 +492,15 @@
"required": true
},
"responses": {
"204": {
"description": "resource updated"
"200": {
"description": "successful operation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HandleInstancePutResultResult"
}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
Expand All @@ -506,8 +513,8 @@
},
"/instances/{instance_id}/creation-success": {
"put": {
"summary": "Asynchronously report the successful result of a potentially long-running",
"description": "instance_put call to sled-agent made during instance creation.",
"summary": "Asynchronously report the successful result of certain instance_put calls",
"description": "(such as the potentially long-running one made during instance creation)",
"operationId": "cpapi_handle_instance_put_success",
"parameters": [
{
Expand All @@ -531,8 +538,15 @@
"required": true
},
"responses": {
"204": {
"description": "resource updated"
"200": {
"description": "successful operation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HandleInstancePutResultResult"
}
}
}
},
"4XX": {
"$ref": "#/components/responses/Error"
Expand Down Expand Up @@ -3753,6 +3767,39 @@
"format": "uint64",
"minimum": 0
},
"HandleInstancePutResultResult": {
"description": "Tells sled-agent whether an instance whose status it's reporting is still relevant, or if it's timed out during creation and been marked as failed (such that sled-agent can destroy the tardy instance)",
"oneOf": [
{
"type": "object",
"properties": {
"last_result": {
"type": "string",
"enum": [
"ok"
]
}
},
"required": [
"last_result"
]
},
{
"type": "object",
"properties": {
"last_result": {
"type": "string",
"enum": [
"timed_out"
]
}
},
"required": [
"last_result"
]
}
]
},
"HistogramError": {
"description": "Errors related to constructing histograms or adding samples into them.",
"oneOf": [
Expand Down
28 changes: 19 additions & 9 deletions sled-agent/src/instance_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use illumos_utils::link::VnicAllocator;
use illumos_utils::opte::PortManager;
use illumos_utils::running_zone::ZoneBuilderFactory;
use illumos_utils::vmm_reservoir;
use nexus_client::types::HandleInstancePutResultResult;
use omicron_common::api::external::ByteCount;
use omicron_common::api::internal::nexus::InstanceRuntimeState;
use omicron_common::api::internal::nexus::SledInstanceState;
Expand Down Expand Up @@ -382,10 +383,12 @@ impl InstanceManager {
&instance_state,
)
.await
.map(nexus_client::ResponseValue::into_inner)
{
Ok(
TODO HandleInstancePutResultResult::Ok
) => {}
Ok(HandleInstancePutResultResult::Ok) => {}
Ok(HandleInstancePutResultResult::TimedOut) => {
todo!("nexus doesn't want us any more, terminate instance")
}
Err(err) => {
error!(log, "Failed to inform Nexus of instance_put success";
"err" => %err,
Expand All @@ -396,18 +399,25 @@ impl InstanceManager {
}
}
Err(instance_put_error) => {
if let Err(err) = nexus_client
match nexus_client
.cpapi_handle_instance_put_failure(
&instance_id,
&instance_put_error.to_string(),
)
.await
.map(nexus_client::ResponseValue::into_inner)
{
error!(log, "Failed to inform Nexus of instance_put failure";
"err" => %err,
"instance_id" => %instance_id,
"instance_put_error" => %instance_put_error,
);
Ok(HandleInstancePutResultResult::Ok) => {}
Ok(HandleInstancePutResultResult::TimedOut) => {
todo!("well, i guess this is less awkward but clean up if we have to")
}
Err(err) => {
error!(log, "Failed to inform Nexus of instance_put failure";
"err" => %err,
"instance_id" => %instance_id,
"instance_put_error" => %instance_put_error,
);
}
}
}
};
Expand Down

0 comments on commit b6f0150

Please sign in to comment.