From 261d79f3934de6b100dbd3479c4e212ae2e2bacd Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Fri, 15 Nov 2024 12:46:27 +0100 Subject: [PATCH] [FINAL] feat: Best-effort responses --- docs/references/ic-interface-spec.md | 185 +++++++++++++++++++++++---- 1 file changed, 159 insertions(+), 26 deletions(-) diff --git a/docs/references/ic-interface-spec.md b/docs/references/ic-interface-spec.md index 81fa75be62..98685eaa7e 100644 --- a/docs/references/ic-interface-spec.md +++ b/docs/references/ic-interface-spec.md @@ -127,6 +127,10 @@ This specification may refer to certain constants and limits without specifying Maximum wall clock time spent on evaluation of a query call. +- `MAX_CALL_TIMEOUT` + + The maximum timeout (in seconds) for an inter-canister call. + ### Principals {#principal} Principals are generic identifiers for canisters, users and possibly other concepts in the future. As far as most uses of the IC are concerned they are *opaque* binary blobs with a length between 0 and 29 bytes, and there is intentionally no mechanism to tell canister ids and user ids apart. @@ -1109,6 +1113,8 @@ Rejection codes are member of the following enumeration: - `CANISTER_ERROR` (5): Canister error (e.g., trap, no response) +- `SYS_UNKNOWN` (6): Response unknown; system stopped waiting for it (e.g., timed out, or system under high load). + The symbolic names of this enumeration are used throughout this specification, but on all interfaces (HTTPS API, System API), they are represented as positive numbers as given in the list above. The error message is guaranteed to be a string, i.e. not arbitrary binary data. @@ -1411,6 +1417,7 @@ The following sections describe various System API functions, also referred to a ic0.msg_reject_code : () -> i32; // Ry Rt CRy CRt ic0.msg_reject_msg_size : () -> i32; // Rt CRt ic0.msg_reject_msg_copy : (dst : i32, offset : i32, size : i32) -> (); // Rt CRt + ic0.msg_deadline : () -> i64; // U Q CQ Ry Rt CRy CRt ic0.msg_reply_data_append : (src : i32, size : i32) -> (); // U RQ NRQ CQ Ry Rt CRy CRt ic0.msg_reply : () -> (); // U RQ NRQ CQ Ry Rt CRy CRt @@ -1449,6 +1456,7 @@ The following sections describe various System API functions, also referred to a ) -> (); ic0.call_on_cleanup : (fun : i32, env : i32) -> (); // U CQ Ry Rt CRy CRt T ic0.call_data_append : (src : i32, size : i32) -> (); // U CQ Ry Rt CRy CRt T + ic0.call_with_best_effort_response : (timeout_seconds : i32) -> (); // U CQ Ry Rt CRy CRt T ic0.call_cycles_add : (amount : i64) -> (); // U Ry Rt T ic0.call_cycles_add128 : (amount_high : i64, amount_low: i64) -> (); // U Ry Rt T ic0.call_perform : () -> ( err_code : i32 ); // U CQ Ry Rt CRy CRt T @@ -1554,6 +1562,12 @@ The canister can access an argument. For `canister_init`, `canister_post_upgrade The reject message. Traps if there is no reject message (i.e. if `reject_code` is `0`). +- `ic0.msg_deadline : () -> i64` + + The deadline, in nanoseconds since 1970-01-01, after which the caller might stop waiting for a response. + + For calls to update methods with best-effort responses and their callbacks, the deadline is computed based on the time the call was made, and the `timeout_seconds` parameter provided by the caller. For other calls (ingress messages and all calls to query and composite query methods, including calls in replicated mode) a deadline of 0 is returned. + ### Responding {#responding} Eventually, the canister will want to respond to the original call, either by replying (indicating success) or rejecting (signalling an error): @@ -1678,6 +1692,16 @@ If this traps (e.g. runs out of cycles), the state changes from the `cleanup` fu There must be at most one call to `ic0.call_on_cleanup` between `ic0.call_new` and `ic0.call_perform`. +- `ic0.call_with_best_effort_response : (timeout_seconds : i32) -> ()` + + Relaxes the response delivery guarantee to be best effort, asking the system to respond at the latest after `timeout_seconds` have elapsed. Best effort means the system may also respond with a `SYS_UNKNOWN` reject code, signifying that the call **may or may not** have been processed by the callee. Then, even if the callee produces a response, it will not be delivered to the caller. + + Any value for `timeout_seconds` is permitted, but is silently bounded from above by the `MAX_CALL_TIMEOUT` system constant; i.e., larger timeouts are treated as equivalent to `MAX_CALL_TIMEOUT` and do not cause an error. The implementation may add a specific [error code](#error-codes) to a reject message to indicate the cause, in particular whether the timeout expired. Note that the reject callback may be executed (possibly significantly) later than the specified time (e.g., if the caller is under high load), or before timeout expiration (e.g., if the system is under load). + + A caller that receives a `SYS_UNKNOWN` code, yet needs to learn the call outcome, must find an out-of-band way of doing so. For example, if the callee provides idempotent function calls, the caller can simply retry the call. Sample causes of `SYS_UNKNOWN` include the call not being delivered in time, call processing not completing in time, reply delivery taking too long, and the system shedding load. + + This method can be called only in between `ic0.call_new` and `ic0.call_perform`, and at most once at that. Otherwise, it traps. A different timeout can be specified for each call. + - `ic0.call_data_append : (src : i32, size : i32) -> ()` Appends the specified bytes to the argument of the call. Initially, the argument is empty. Traps if the total appended data exceeds the [maximum inter-canister call payload](https://internetcomputer.org/docs/current/developer-docs/backend/resource-limits#resource-constraints-and-limits). @@ -3148,6 +3172,7 @@ The [WebAssembly System API](#system-api) is relatively low-level, and some of i arg: Blob; transferred_cycles: Nat; callback: Callback; + timeout_seconds : NoTimeout | Nat; } UpdateFunc = WasmState -> Trap { cycles_used : Nat; } | Return { @@ -3198,12 +3223,12 @@ The [WebAssembly System API](#system-api) is relatively low-level, and some of i new_global_timer : NoGlobalTimer | Nat; cycles_used : Nat; } - update_methods : MethodName ↦ ((Arg, CallerId, Env, AvailableCycles) -> UpdateFunc) + update_methods : MethodName ↦ ((Arg, CallerId, Deadline, Env, AvailableCycles) -> UpdateFunc) query_methods : MethodName ↦ ((Arg, CallerId, Env) -> QueryFunc) composite_query_methods : MethodName ↦ ((Arg, CallerId, Env) -> CompositeQueryFunc) heartbeat : (Env) -> SystemTaskFunc global_timer : (Env) -> SystemTaskFunc - callbacks : (Callback, Response, RefundedCycles, Env, AvailableCycles) -> UpdateFunc + callbacks : (Callback, Response, Deadline, RefundedCycles, Env, AvailableCycles) -> UpdateFunc composite_callbacks : (Callback, Response, Env) -> UpdateFunc inspect_message : (MethodName, WasmState, Arg, CallerId, Env) -> Trap | Return { status : Accept | Reject; @@ -3247,6 +3272,7 @@ CallOrigin | FromCanister { calling_context : CallId; callback: Callback; + deadline : NoDeadline | Timestamp | Expired Timestamp; } | FromSystemTask CallCtxt = { @@ -3608,12 +3634,13 @@ The following is an incomplete list of invariants that should hold for the abstr ∀ (_ ↦ Ctxt) ∈ S.call_contexts: S.canister_status[Ctxt.canister] ≠ Stopped ``` -- Referenced call contexts exist: +- Referenced call contexts exist, unless the origins have expired deadlines: + ``` - ∀ CallMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ∈ dom(S.call_contexts) - ∀ ResponseMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ∈ dom(S.call_contexts) - ∀ (_ ↦ {needs_to_respond = true, origin = FromCanister O, …}) ∈ S.call_contexts: O.calling_context ∈ dom(S.call_contexts) - ∀ (_ ↦ Stopping Origins) ∈ S.canister_status: ∀(FromCanister O, _) ∈ Origins. O.calling_context ∈ dom(S.call_contexts) + ∀ CallMessage {origin = FromCanister O, …} ∈ S.messages. O.deadline ≠ Expired _ ⇒ O.calling_context ∈ dom(S.call_contexts) + ∀ ResponseMessage {origin = FromCanister O, …} ∈ S.messages. O.deadline ≠ Expired _ ⇒ O.calling_context ∈ dom(S.call_contexts) + ∀ (_ ↦ {needs_to_respond = true, origin = FromCanister O, …}) ∈ S.call_contexts: O.deadline ≠ Expired _ ⇒ O.calling_context ∈ dom(S.call_contexts) + ∀ (_ ↦ Stopping Origins) ∈ S.canister_status: ∀(FromCanister O, _) ∈ Origins. O.deadline ≠ Expired _ ⇒ O.calling_context ∈ dom(S.call_contexts) ``` ### State transitions @@ -3992,6 +4019,17 @@ Note that new messages are executed only if the canister is Running and is not f The transition models the actual execution of a message, whether it is an initial call to a public method or a response. In either case, a call context already exists (see transition "Call context creation"). +For convenience, we first define a function that extracts the deadline from a call context. Note that user and system messages have no deadline. + +```html + +deadline_of_context(ctxt) = match ctxt.origin with + FromCanister O if O.deadline ≠ Expired _ → O.deadline + FromCanister O if O.deadline = Expired ts → ts + otherwise → NoDeadline + +``` + Conditions ```html @@ -4000,6 +4038,8 @@ S.messages = Older_messages · FuncMessage M · Younger_messages (M.queue = Unordered) or (∀ CallMessage M' | FuncMessage M' ∈ Older_messages. M'.queue ≠ M.queue) S.canisters[M.receiver] ≠ EmptyCanister Mod = S.canisters[M.receiver].module +Ctxt = S.call_contexts[M.call_context] +Deadline = deadline_of_context(Ctxt) Is_response = M.entry_point == Callback _ _ _ @@ -4023,9 +4063,9 @@ Env = { canister_version = S.canister_version[M.receiver]; } -Available = S.call_contexts[M.call_contexts].available_cycles +Available = Ctxt.available_cycles ( M.entry_point = PublicMethod Name Caller Arg - F = Mod.update_methods[Name](Arg, Caller, Env, Available) + F = Mod.update_methods[Name](Arg, Caller, Deadline, Env, Available) New_canister_version = S.canister_version[M.receiver] + 1 Wasm_memory_limit = S.wasm_memory_limit[M.receiver] ) @@ -4037,7 +4077,7 @@ or ) or ( M.entry_point = Callback Callback Response RefundedCycles - F = Mod.callbacks(Callback, Response, RefundedCycles, Env, Available) + F = Mod.callbacks(Callback, Response, Deadline, RefundedCycles, Env, Available) New_canister_version = S.canister_version[M.receiver] + 1 Wasm_memory_limit = 0 ) @@ -4098,7 +4138,7 @@ if memory_usage_chunk_store(S.chunk_store[M.receiver]) + memory_usage_snapshot(S.snapshots[M.receiver]) ≤ S.memory_allocation[M.receiver]) (Wasm_memory_limit = 0) or |res.new_state.store.mem| <= Wasm_memory_limit - (res.response = NoResponse) or S.call_contexts[M.call_context].needs_to_respond + (res.response = NoResponse) or Ctxt.needs_to_respond then S with canisters[M.receiver].wasm_state = res.new_state; @@ -4110,6 +4150,10 @@ then origin = FromCanister { call_context = M.call_context; callback = call.callback; + deadline = if call.timeout_seconds ≠ NoTimeout + then S.time[M.receiver] + call.timeout_seconds * 10^9 + else NoDeadline + }; caller = M.receiver; callee = call.callee; @@ -4120,7 +4164,7 @@ then } | call ∈ res.new_calls ] · [ ResponseMessage { - origin = S.call_contexts[M.call_context].origin + origin = Ctxt.origin; response = res.response; refunded_cycles = Available - res.cycles_accepted; } @@ -4231,6 +4275,50 @@ S.messages = } ``` +#### Call expiry {#call-expiry} + +These transitions expire calls with best-effort responses. The transition can be taken before the specified call deadline (e.g., due to high system load), and we thus ignore the caller time in these transitions. We define two variants of the transition, one that expires messages, and one that expires calls that are in progress (i.e., have open downstream call contexts). + +The first transition defines the expiry of messages. + +```html +S.messages = Older_messages · M · Younger_messages +M = CallMessage _ ∨ M = ResponseMessage _ +M.origin = FromCanister O +O.deadline = timestamp +``` + +State after +```html +S.messages = Older_messages · (M with origin = FromCanister O with deadline = Expired timestamp) · Younger_messages · + ResponseMessage { + origin = FromCanister O with deadline = NoDeadline; + response = Reject (SYS_UNKNOWN, ); + refunded_cycles = 0; + } +``` + +The next transition defines the expiry of calls that are being processed by the callee. + +Condition +```html +ctxt_id ∈ S.call_contexts +S.call_contexts[ctxt_id].origin = FromCanister O +S.call_contexts[ctxt_id].needs_to_respond = true +O.deadline = timestamp +``` + +State after + +```html +S.call_contexts[ctxt_id].origin = FromCanister O with deadline = Expired timestamp +S.messages = S.messages · ResponseMessage { + origin = FromCanister O with deadline = NoDeadline; + response = Reject (SYS_UNKNOWN, ); + refunded_cycles = 0; +} +``` + #### Call context starvation {#rule-starvation} If the call context needs to respond (in particular, if the call context is not for a system task) and there is no call, downstream call context, or response that references a call context, then a reject is synthesized. The error message below is *not* indicative. In particular, if the IC has an idea about *why* this starved, it can put that in there (e.g. the initial message handler trapped with an out-of-memory access). @@ -4240,10 +4328,10 @@ Conditions ```html S.call_contexts[Ctxt_id].needs_to_respond = true -∀ CallMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id -∀ ResponseMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id -∀ (_ ↦ {needs_to_respond = true, origin = FromCanister O, …}) ∈ S.call_contexts: O.calling_context ≠ Ctxt_id -∀ (_ ↦ Stopping Origins) ∈ S.canister_status: ∀(FromCanister O, _) ∈ Origins. O.calling_context ≠ Ctxt_id +∀ CallMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ +∀ ResponseMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ +∀ (_ ↦ {needs_to_respond = true, origin = FromCanister O, …}) ∈ S.call_contexts: O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ +∀ (_ ↦ Stopping Origins) ∈ S.canister_status: ∀(FromCanister O, _) ∈ Origins. O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ ``` @@ -4259,7 +4347,7 @@ S with ResponseMessage { origin = S.call_contexts[Ctxt_id].origin; response = Reject (CANISTER_ERROR, ); - refunded_cycles = S.call_contexts[Ctxt_id].available_cycles + refunded_cycles = S.call_contexts[Ctxt_id].available_cycles; } ``` @@ -4273,10 +4361,10 @@ Conditions ```html S.call_contexts[Ctxt_id].needs_to_respond = false -∀ CallMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id -∀ ResponseMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id -∀ (_ ↦ {needs_to_respond = true, origin = FromCanister O, …}) ∈ S.call_contexts: O.calling_context ≠ Ctxt_id -∀ (_ ↦ Stopping Origins) ∈ S.canister_status: ∀(FromCanister O, _) ∈ Origins. O.calling_context ≠ Ctxt_id +∀ CallMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ +∀ ResponseMessage {origin = FromCanister O, …} ∈ S.messages. O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ +∀ (_ ↦ {needs_to_respond = true, origin = FromCanister O, …}) ∈ S.call_contexts: O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ +∀ (_ ↦ Stopping Origins) ∈ S.canister_status: ∀(FromCanister O, _) ∈ Origins. O.calling_context ≠ Ctxt_id ∨ O.deadline = Expired _ ``` @@ -5830,9 +5918,12 @@ S.messages = Older_messages · ResponseMessage RM · Younger_messages RM.origin = FromCanister { call_context = Ctxt_id callback = Callback + deadline = D } +Ctxt_id ∈ dom(S.call_contexts) not S.call_contexts[Ctxt_id].deleted S.call_contexts[Ctxt_id].canister ∈ dom(S.balances) +D ≠ Expired _ ``` @@ -5855,7 +5946,7 @@ S with ``` -If the responded call context does not exist anymore, because the canister has been uninstalled since, the refunded cycles are still added to the canister balance, but no function invocation is enqueued: +If the responded call context does not exist anymore, because the canister has been uninstalled since, the refunded cycles are still added to the canister balance, but no function invocation is enqueued. Conditions @@ -5865,9 +5956,12 @@ S.messages = Older_messages · ResponseMessage RM · Younger_messages RM.origin = FromCanister { call_context = Ctxt_id callback = Callback + deadline = D } +Ctxt_id ∈ dom(S.call_contexts) S.call_contexts[Ctxt_id].deleted S.call_contexts[Ctxt_id].canister ∈ dom(S.balances) +D ≠ Expired _ ``` @@ -5882,6 +5976,22 @@ S with ``` +#### Dropping expired messages {#message-timeout} + +Condition: +```html +S.messages = Older_messages · M · Younger_messages +M = ResponseMessage _ ∨ M = CallMessage _ +M.origin = FromCanister O +O.deadline = Expired _ +``` + +State after + +```html +S.messages = Older_messages · Younger_messages +``` + #### Respond to user request When an ingress method call has been responded to, we can record the response in the list of queries. @@ -6534,6 +6644,7 @@ Params = { sysenv : Env; cycles_refunded : Nat; method_name : NoText | Text; + deadline : NoDeadline | Timestamp; } ExecutionState = { wasm_state : WasmState; @@ -6591,6 +6702,7 @@ liquid_balance(es) = sysenv = (undefined); cycles_refunded = 0; method_name = NoText; + deadline = NoDeadline; } empty_execution_state = { wasm_state = (undefined); @@ -6761,10 +6873,15 @@ Finally, we can specify the abstract `CanisterModule` that models a concrete Web - The partial map `update_methods` of the `CanisterModule` is defined for all method names `method` for which the WebAssembly program exports a function `func` named `canister_update `, and has value ``` - update_methods[method] = λ (arg, caller, sysenv, available) → λ wasm_state → + update_methods[method] = λ (arg, caller, deadline, sysenv, available) → λ wasm_state → let es = ref {empty_execution_state with wasm_state = wasm_state; - params = empty_params with { arg = arg; caller = caller; sysenv } + params = empty_params with { + arg = arg; + caller = caller; + deadline = deadline; + sysenv; + } balance = sysenv.balance cycles_available = available; context = U @@ -6877,10 +6994,11 @@ global_timer = λ (sysenv) → λ wasm_state → Trap {cycles_used = 0;} - The function `callbacks` of the `CanisterModule` is defined as follows ``` - callbacks = λ(callbacks, response, refunded_cycles, sysenv, available) → λ wasm_state → + callbacks = λ(callbacks, response, deadline, refunded_cycles, sysenv, available) → λ wasm_state → let params0 = empty_params with { - sysenv + sysenv; cycles_refunded = refund_cycles; + deadline; } let (fun, env, params, context) = match response with Reply data -> @@ -7072,6 +7190,12 @@ ic0.msg_reject_msg_copy(dst:i32, offset:i32, size:i32) : i32 = if es.context ∉ {Rt, CRt} then Trap {cycles_used = es.cycles_used;} copy_to_canister(dst, offset, size, es.params.reject_msg) +ic0.msg_deadline() : i64 = + if es.context ∉ {U, Q, CQ, Ry, Rt, CRy, CRt} then Trap {cycles_used = es.cycles_used;} + if es.params.deadline = Timestamp t + then return t + else return 0 + ic0.msg_reply_data_append(src : i32, size : i32) = if es.context ∉ {U, RQ, NRQ, CQ, Ry, Rt, CRy, CRt} then Trap {cycles_used = es.cycles_used;} if es.response ≠ NoResponse then Trap {cycles_used = es.cycles_used;} @@ -7204,6 +7328,15 @@ ic0.call_new( }; } +ic0.call_with_best_effort_response(timeout_seconds : i32) = + if + es.context ∉ {U, CQ, Ry, Rt, CRy, CRt, T} + or es.pending_call = NoPendingCall + or es.pending_call.timeout ≠ NoTimeout + then Trap {cycles_used = es.cycles_used;} + es.pending_call.timeout_seconds := min(timeout_seconds, MAX_CALL_TIMEOUT) + + ic0.call_on_cleanup (fun : i32, env : i32) = if es.context ∉ {U, CQ, Ry, Rt, CRy, CRt, T} then Trap {cycles_used = es.cycles_used;} if es.pending_call = NoPendingCall then Trap {cycles_used = es.cycles_used;}