diff --git a/changes/3457.feature.md b/changes/3457.feature.md new file mode 100644 index 00000000000..13811f82382 --- /dev/null +++ b/changes/3457.feature.md @@ -0,0 +1 @@ +Add prometheus client for metrics collection \ No newline at end of file diff --git a/configs/agent/ci.toml b/configs/agent/ci.toml index d8eadfcdb04..2b678be6175 100644 --- a/configs/agent/ci.toml +++ b/configs/agent/ci.toml @@ -7,6 +7,11 @@ password = "" [agent] rpc-listen-addr = { host = "127.0.0.1", port = 6001 } +# metric API service address +service-addr = { host = "0.0.0.0", port = 6003 } +ssl-enabled = false +#ssl-cert = "" +#ssl-key = "" agent-sock-port = 6007 id = "i-travis" scaling-group = "default" diff --git a/configs/agent/halfstack.toml b/configs/agent/halfstack.toml index 79c6231aeee..74cb2c891c1 100644 --- a/configs/agent/halfstack.toml +++ b/configs/agent/halfstack.toml @@ -8,6 +8,11 @@ password = "" [agent] mode = "docker" rpc-listen-addr = { host = "127.0.0.1", port = 6001 } +# metric API service address +service-addr = { host = "0.0.0.0", port = 6003 } +ssl-enabled = false +#ssl-cert = "" +#ssl-key = "" agent-sock-port = 6007 # id = "i-something-special" scaling-group = "default" diff --git a/configs/storage-proxy/halfstack.toml b/configs/storage-proxy/halfstack.toml index 271086dd096..594fbacf805 100644 --- a/configs/storage-proxy/halfstack.toml +++ b/configs/storage-proxy/halfstack.toml @@ -55,7 +55,7 @@ ssl-enabled = false [api.manager] # Manager-facing API -service-addr = { host = "127.0.0.1", port = 6022 } +service-addr = { host = "0.0.0.0", port = 6022 } ssl-enabled = false # ssl-cert = "configs/storage-proxy/ssl/manager-api-selfsigned.cert.pem" # ssl-privkey = "configs/storage-proxy/ssl/manager-api-selfsigned.key.pem" diff --git a/configs/storage-proxy/sample.toml b/configs/storage-proxy/sample.toml index 06469086175..d65cffb5f2a 100644 --- a/configs/storage-proxy/sample.toml +++ b/configs/storage-proxy/sample.toml @@ -67,7 +67,7 @@ ssl-enabled = false [api.manager] 
# Manager-facing API # Recommended to have SSL and bind on a private IP only accessible by managers -service-addr = { host = "127.0.0.1", port = 6022 } +service-addr = { host = "0.0.0.0", port = 6022 } ssl-enabled = true ssl-cert = "configs/storage-proxy/ssl/manager-api-selfsigned.cert.pem" ssl-privkey = "configs/storage-proxy/ssl/manager-api-selfsigned.key.pem" diff --git a/docs/manager/rest-reference/openapi.json b/docs/manager/rest-reference/openapi.json index 6b888d1ea35..925a6a6d3ac 100644 --- a/docs/manager/rest-reference/openapi.json +++ b/docs/manager/rest-reference/openapi.json @@ -1445,7 +1445,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/events/session": { @@ -1515,7 +1515,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/auth": { @@ -2099,7 +2099,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "folders.delete_by_id", @@ -2155,7 +2155,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "folders.delete_by_name", @@ -2255,7 +2255,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, 
"/folders/_/all-hosts": { @@ -2275,7 +2275,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/_/allowed-types": { @@ -2295,7 +2295,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/_/all_hosts": { @@ -2315,7 +2315,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/_/allowed_types": { @@ -2335,7 +2335,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/_/perf-metric": { @@ -2364,7 +2364,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/rename": { @@ -2521,7 +2521,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/request-upload": { @@ -2572,7 +2572,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager 
status required: RUNNING\n" } }, "/folders/{name}/request-download": { @@ -2623,7 +2623,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/move-file": { @@ -2674,7 +2674,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/rename-file": { @@ -2729,7 +2729,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/delete-files": { @@ -2783,7 +2783,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "folders.delete_files.2", @@ -2829,7 +2829,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/rename_file": { @@ -2884,7 +2884,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/delete_files": { @@ -2932,7 +2932,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status 
required: RUNNING\n" } }, "/folders/{name}/files": { @@ -2969,7 +2969,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/{name}/invite": { @@ -3402,7 +3402,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/invitations/list_sent": { @@ -3422,7 +3422,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/invitations/update/{inv_id}": { @@ -3494,7 +3494,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/invitations/accept": { @@ -3627,7 +3627,7 @@ "in": "query" } ], - "description": "\nList shared vfolders.\n\nNot available for group vfolders.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nList shared vfolders.\n\nNot available for group vfolders.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "post": { "operationId": "folders.update_shared_vfolder", @@ -3743,7 +3743,7 @@ "in": "query" } ], - "description": "\nReturn the contents of `/etc/fstab` file.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\nReturn the contents of `/etc/fstab` file.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* 
Manager status required: RUNNING\n" } }, "/folders/_/mounts": { @@ -3763,7 +3763,7 @@ } ], "parameters": [], - "description": "\nList all mounted vfolder hosts in vfroot.\n\nAll mounted hosts from connected (ALIVE) agents are also gathered.\nGenerally, agents should be configured to have same hosts structure,\nbut newly introduced one may not.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\nList all mounted vfolder hosts in vfroot.\n\nAll mounted hosts from connected (ALIVE) agents are also gathered.\nGenerally, agents should be configured to have same hosts structure,\nbut newly introduced one may not.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" }, "post": { "operationId": "folders.mount_host", @@ -4001,7 +4001,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "post": { "operationId": "folders.update_quota", @@ -4086,7 +4086,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/folders/_/used-bytes": { @@ -4124,7 +4124,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "//graphql": { @@ -4312,7 +4312,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "post": { "operationId": 
"services.create", @@ -4409,7 +4409,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}": { @@ -4445,7 +4445,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "services.delete", @@ -4479,7 +4479,7 @@ } } ], - "description": "\nRemoves model service (and inference sessions for the service also).\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nRemoves model service (and inference sessions for the service also).\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}/errors": { @@ -4515,7 +4515,7 @@ } } ], - "description": "\nList errors raised while trying to create the inference sessions. Backend.AI will\nstop trying to create an inference session for the model service if six (6) error stacks\nup. The only way to clear the error and retry spawning session is to call\n`clear_error` (POST /services/{service_id}/errors/clear) API.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nList errors raised while trying to create the inference sessions. Backend.AI will\nstop trying to create an inference session for the model service if six (6) error stacks\nup. 
The only way to clear the error and retry spawning session is to call\n`clear_error` (POST /services/{service_id}/errors/clear) API.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}/errors/clear": { @@ -4544,7 +4544,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}/scale": { @@ -4589,7 +4589,7 @@ } } ], - "description": "\nUpdates ideal inference session count manually. Based on the difference of this number,\ninference sessions will be created or removed automatically.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nUpdates ideal inference session count manually. Based on the difference of this number,\ninference sessions will be created or removed automatically.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}/sync": { @@ -4625,7 +4625,7 @@ } } ], - "description": "\nForce syncs up-to-date model service information with AppProxy.\nIn normal situations this will be automatically handled by Backend.AI schedulers,\nbut this API is left open in case of unexpected restart of AppProxy process.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nForce syncs up-to-date model service information with AppProxy.\nIn normal situations this will be automatically handled by Backend.AI schedulers,\nbut this API is left open in case of unexpected restart of AppProxy process.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}/routings/{route_id}": { @@ -4678,7 +4678,7 @@ } } ], - "description": "\nUpdates traffic bias of specific 
route.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nUpdates traffic bias of specific route.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "services.delete_route", @@ -4720,7 +4720,7 @@ } } ], - "description": "\nScales down the service by removing specific inference session.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nScales down the service by removing specific inference session.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/services/{service_id}/token": { @@ -4765,7 +4765,7 @@ } } ], - "description": "\nGenerates a token which acts as an API key to authenticate when calling model service endpoint.\nIf both duration and valid_until is not set then the AppProxy will determine appropriate lifetime of the token.\nduration and valid_until can't be both specified.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nGenerates a token which acts as an API key to authenticate when calling model service endpoint.\nIf both duration and valid_until is not set then the AppProxy will determine appropriate lifetime of the token.\nduration and valid_until can't be both specified.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session": { @@ -5381,7 +5381,7 @@ "in": "query" } ], - "description": "\nA quick session-ID matcher API for use with auto-completion in CLI.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nA quick session-ID matcher API for use with auto-completion in CLI.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/_/sync-agent-registry": { @@ -5484,7 +5484,7 @@ } } ], - "description": 
"\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "patch": { "operationId": "session.restart", @@ -5527,7 +5527,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "session.destroy", @@ -5580,7 +5580,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "post": { "operationId": "session.execute", @@ -5607,7 +5607,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/_/logs": { @@ -5646,7 +5646,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "session.get_task_logs.2", @@ -5674,7 +5674,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/direct-access-info": { @@ -5703,7 +5703,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/logs": { @@ 
-5741,7 +5741,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/rename": { @@ -5818,7 +5818,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/complete": { @@ -5847,7 +5847,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/shutdown-service": { @@ -5894,7 +5894,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/upload": { @@ -5923,7 +5923,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/download": { @@ -5995,7 +5995,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/download_single": { @@ -6064,7 +6064,7 @@ } } ], - "description": "\nDownload a single file from the scratch root. Only for small files.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nDownload a single file from the scratch root. 
Only for small files.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/files": { @@ -6093,7 +6093,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/start-service": { @@ -6154,7 +6154,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/session/{session_name}/commit": { @@ -6348,7 +6348,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/stream/session/{session_name}/pty": { @@ -6377,7 +6377,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/stream/session/{session_name}/execute": { @@ -6406,7 +6406,7 @@ } } ], - "description": "\nWebSocket-version of gateway.kernel.execute().\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nWebSocket-version of gateway.kernel.execute().\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/stream/session/{session_name}/apps": { @@ -6435,7 +6435,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/stream/session/{session_name}/httpproxy": { @@ -6498,7 +6498,7 @@ 
"in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/stream/session/{session_name}/tcpproxy": { @@ -6561,7 +6561,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/manager/status": { @@ -6884,7 +6884,7 @@ } }, "parameters": [], - "description": "\nReturns the list of all resource presets in the current scaling group,\nwith additional information including allocatability of each preset,\namount of total remaining resources, and the current keypair resource limits.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nReturns the list of all resource presets in the current scaling group,\nwith additional information including allocatability of each preset,\namount of total remaining resources, and the current keypair resource limits.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/recalculate-usage": { @@ -6904,7 +6904,7 @@ } ], "parameters": [], - "description": "\nUpdate `keypair_resource_usages` in redis and `agents.c.occupied_slots`.\n\nThose two values are sometimes out of sync. In that case, calling this API\nre-calculates the values for running containers and updates them in DB.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\nUpdate `keypair_resource_usages` in redis and `agents.c.occupied_slots`.\n\nThose two values are sometimes out of sync. 
In that case, calling this API\nre-calculates the values for running containers and updates them in DB.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/usage/month": { @@ -6945,7 +6945,7 @@ "in": "query" } ], - "description": "\nReturn usage statistics of terminated containers for a specified month.\nThe date/time comparison is done using the configured timezone.\n\n:param group_ids: If not None, query containers only in those groups.\n:param month: The year-month to query usage statistics. ex) \"202006\" to query for Jun 2020\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\nReturn usage statistics of terminated containers for a specified month.\nThe date/time comparison is done using the configured timezone.\n\n:param group_ids: If not None, query containers only in those groups.\n:param month: The year-month to query usage statistics. ex) \"202006\" to query for Jun 2020\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/usage/period": { @@ -6992,7 +6992,7 @@ "in": "query" } ], - "description": "\nReturn usage statistics of terminated containers belonged to the given group for a specified\nperiod in dates.\nThe date/time comparison is done using the configured timezone.\n\n:param project_id: If not None, query containers only in the project.\n:param start_date str: \"yyyymmdd\" format.\n:param end_date str: \"yyyymmdd\" format.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\nReturn usage statistics of terminated containers belonged to the given group for a specified\nperiod in dates.\nThe date/time comparison is done using the configured timezone.\n\n:param project_id: If not None, query containers only in the project.\n:param start_date str: \"yyyymmdd\" format.\n:param end_date str: \"yyyymmdd\" 
format.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/stats/user/month": { @@ -7012,7 +7012,7 @@ } ], "parameters": [], - "description": "\nReturn time-binned (15 min) stats for terminated user sessions\nover last 30 days.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\nReturn time-binned (15 min) stats for terminated user sessions\nover last 30 days.\n\n\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/stats/admin/month": { @@ -7032,7 +7032,7 @@ } ], "parameters": [], - "description": "\nReturn time-binned (15 min) stats for all terminated sessions\nover last 30 days.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\nReturn time-binned (15 min) stats for all terminated sessions\nover last 30 days.\n\n\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/watcher": { @@ -7061,7 +7061,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/watcher/agent/start": { @@ -7099,7 +7099,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/resource/watcher/agent/stop": { @@ -7137,7 +7137,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, 
"/resource/watcher/agent/restart": { @@ -7175,7 +7175,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Superadmin privilege required.\n* Manager status required: RUNNING\n" } }, "/scaling-groups": { @@ -7212,7 +7212,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/scaling-groups/{scaling_group}/wsproxy-version": { @@ -7257,7 +7257,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/template/cluster": { @@ -7308,7 +7308,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "template/cluster.list_template", @@ -7351,7 +7351,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/template/cluster/{template_id}": { @@ -7401,7 +7401,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "put": { "operationId": "template/cluster.put", @@ -7449,7 +7449,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": 
"\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "template/cluster.delete", @@ -7484,7 +7484,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/template/session": { @@ -7535,7 +7535,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "template/session.list_template", @@ -7578,7 +7578,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/template/session/{template_id}": { @@ -7628,7 +7628,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "put": { "operationId": "template/session.put", @@ -7686,7 +7686,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "template/session.delete", @@ -7721,7 +7721,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/user-config/dotfiles": { @@ -7771,7 +7771,7 @@ } }, "parameters": [], - "description": 
"\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "user-config.list_or_get", @@ -7806,7 +7806,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "patch": { "operationId": "user-config.update", @@ -7854,7 +7854,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "user-config.delete", @@ -7889,7 +7889,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/user-config/bootstrap-script": { @@ -7927,7 +7927,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "user-config.get_bootstrap_script", @@ -7945,7 +7945,7 @@ } ], "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/domain-config/dotfiles": { @@ -7996,7 +7996,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status 
required: RUNNING\n" }, "get": { "operationId": "domain-config.list_or_get", @@ -8031,7 +8031,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "patch": { "operationId": "domain-config.update", @@ -8080,7 +8080,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "domain-config.delete", @@ -8115,7 +8115,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: RUNNING\n" } }, "/group-config/dotfiles": { @@ -8177,7 +8177,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "group-config.list_or_get", @@ -8228,7 +8228,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "patch": { "operationId": "group-config.update", @@ -8288,7 +8288,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: RUNNING\n" }, "delete": { "operationId": "group-config.delete", @@ -8339,7 +8339,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* Admin 
privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* Admin privilege required.\n* Manager status required: RUNNING\n" } }, "/logs/error": { @@ -8411,7 +8411,7 @@ } }, "parameters": [], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" }, "get": { "operationId": "logs/error.list_logs", @@ -8456,7 +8456,7 @@ "in": "query" } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } }, "/logs/error/{log_id}/clear": { @@ -8485,7 +8485,7 @@ } } ], - "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: FROZEN\n" + "description": "\n**Preconditions:**\n* User privilege required.\n* Manager status required: RUNNING\n" } } } diff --git a/python.lock b/python.lock index fee8bfa3ecf..fd7a793299e 100644 --- a/python.lock +++ b/python.lock @@ -66,6 +66,7 @@ // "networkx~=3.3.0", // "packaging>=24.1", // "pexpect~=4.8", +// "prometheus-client~=0.21.1", // "psutil~=6.0", // "pycryptodome>=3.20.0", // "pydantic~=2.9.2", @@ -117,6 +118,7 @@ "allow_wheels": true, "build_isolation": true, "constraints": [], + "elide_unused_requires_dist": false, "excluded": [], "locked_resolves": [ { @@ -1043,36 +1045,36 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "588ab05e2771c50fca5c242be14e7a25200ffd3dd95c45950ce40993473864c7", - "url": "https://files.pythonhosted.org/packages/15/8d/b2a330955817b5cb85cee33ca641424d1894fc73258b8e929e3b9719ea22/boto3-1.35.87-py3-none-any.whl" + "hash": "d0224e1499d7189b47aa7f469d96522d98df6f5702fccb20a95a436582ebcd9d", + "url": 
"https://files.pythonhosted.org/packages/64/e4/e0aba5c388e189a145d13699f59e2ec71600eab0bd545f412d94c130e183/boto3-1.35.98-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "341c58602889078a4a25dc4331b832b5b600a33acd73471d2532c6f01b16fbb4", - "url": "https://files.pythonhosted.org/packages/80/08/cf2a60bcb6d49764379d78e87f29310458257eb413bb7aa85ebe3d8cd0cc/boto3-1.35.87.tar.gz" + "hash": "4b6274b4fe9d7113f978abea66a1f20c8a397c268c9d1b2a6c96b14a256da4a5", + "url": "https://files.pythonhosted.org/packages/aa/5b/60bf8560df0688ea97098c935a5ecdd595e742891ed5e85ebe7257a0ee5d/boto3-1.35.98.tar.gz" } ], "project_name": "boto3", "requires_dists": [ - "botocore<1.36.0,>=1.35.87", + "botocore<1.36.0,>=1.35.98", "botocore[crt]<2.0a0,>=1.21.0; extra == \"crt\"", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0" ], "requires_python": ">=3.8", - "version": "1.35.87" + "version": "1.35.98" }, { "artifacts": [ { "algorithm": "sha256", - "hash": "81cf84f12030d9ab3829484b04765d5641697ec53c2ac2b3987a99eefe501692", - "url": "https://files.pythonhosted.org/packages/ae/12/5329e758bb786ef38292e4caafed6cfc5171a758b3311c55b71cd432267d/botocore-1.35.87-py3-none-any.whl" + "hash": "4f1c0b687488663a774ad3a5e81a5f94fae1bcada2364cfdc48482c4dbf794d5", + "url": "https://files.pythonhosted.org/packages/fd/d5/bb969f907b17e03f8169df1e5c548d6719115f1a1386529c6f8776100fb0/botocore-1.35.98-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "3062d073ce4170a994099270f469864169dc1a1b8b3d4a21c14ce0ae995e0f89", - "url": "https://files.pythonhosted.org/packages/98/8d/2e49e7a99944cbeef4c1182f59af282fcb164feef35dfa420500c4e0ccb3/botocore-1.35.87.tar.gz" + "hash": "d11742b3824bdeac3c89eeeaf5132351af41823bbcef8fc15e95c8250b1de09c", + "url": "https://files.pythonhosted.org/packages/35/af/081b065f46ecb9a930a1964d1b7d275e408a03cb45ff7cd89128481dc986/botocore-1.35.98.tar.gz" } ], "project_name": "botocore", @@ -1084,7 +1086,7 @@ "urllib3<1.27,>=1.25.4; python_version < \"3.10\"" ], "requires_python": 
">=3.8", - "version": "1.35.87" + "version": "1.35.98" }, { "artifacts": [ @@ -2582,31 +2584,32 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "bcaf2d6fd74fb1459f8450e85d994997ad3e70036452cbfa4ab685acb19479b3", - "url": "https://files.pythonhosted.org/packages/64/38/8d37b19f6c882482cae7ba8db6d02fce3cba7b3895c93fc80352b30a18f5/marshmallow-3.23.2-py3-none-any.whl" + "hash": "ec5d00d873ce473b7f2ffcb7104286a376c354cab0c2fa12f5573dab03e87210", + "url": "https://files.pythonhosted.org/packages/8e/25/5b300f0400078d9783fbe44d30fedd849a130fc3aff01f18278c12342b6f/marshmallow-3.25.1-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "c448ac6455ca4d794773f00bae22c2f351d62d739929f761dce5eacb5c468d7f", - "url": "https://files.pythonhosted.org/packages/ac/0f/33b98679f185f5ce58620595b32d4cf8e2fa5fb56d41eb463826558265c6/marshmallow-3.23.2.tar.gz" + "hash": "f4debda3bb11153d81ac34b0d582bf23053055ee11e791b54b4b35493468040a", + "url": "https://files.pythonhosted.org/packages/b8/85/43b8e95251312e8d0d3389263e87e368a5a015db475e140d5dd8cb8dcb47/marshmallow-3.25.1.tar.gz" } ], "project_name": "marshmallow", "requires_dists": [ - "alabaster==1.0.0; extra == \"docs\"", "autodocsumm==0.2.14; extra == \"docs\"", + "furo==2024.8.6; extra == \"docs\"", "marshmallow[tests]; extra == \"dev\"", "packaging>=17.0", "pre-commit<5.0,>=3.5; extra == \"dev\"", "pytest; extra == \"tests\"", "simplejson; extra == \"tests\"", + "sphinx-copybutton==0.5.2; extra == \"docs\"", "sphinx-issues==5.0.0; extra == \"docs\"", - "sphinx-version-warning==1.1.2; extra == \"docs\"", "sphinx==8.1.3; extra == \"docs\"", + "sphinxext-opengraph==0.9.1; extra == \"docs\"", "tox; extra == \"dev\"" ], "requires_python": ">=3.9", - "version": "3.23.2" + "version": "3.25.1" }, { "artifacts": [ @@ -3003,6 +3006,26 @@ "requires_python": ">=3.8", "version": "1.5.0" }, + { + "artifacts": [ + { + "algorithm": "sha256", + "hash": "594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301", + "url": 
"https://files.pythonhosted.org/packages/ff/c2/ab7d37426c179ceb9aeb109a85cda8948bb269b7561a0be870cc656eefe4/prometheus_client-0.21.1-py3-none-any.whl" + }, + { + "algorithm": "sha256", + "hash": "252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb", + "url": "https://files.pythonhosted.org/packages/62/14/7d0f567991f3a9af8d1cd4f619040c93b68f09a02b6d0b6ab1b2d1ded5fe/prometheus_client-0.21.1.tar.gz" + } + ], + "project_name": "prometheus-client", + "requires_dists": [ + "twisted; extra == \"twisted\"" + ], + "requires_python": ">=3.8", + "version": "0.21.1" + }, { "artifacts": [ { @@ -3414,13 +3437,13 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", - "url": "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl" + "hash": "9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", + "url": "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", - "url": "https://files.pythonhosted.org/packages/8e/62/8336eff65bcbc8e4cb5d05b55faf041285951b6e80f33e2bff2024788f31/pygments-2.18.0.tar.gz" + "hash": "61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", + "url": "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz" } ], "project_name": "pygments", @@ -3428,7 +3451,7 @@ "colorama>=0.4.6; extra == \"windows-terminal\"" ], "requires_python": ">=3.8", - "version": "2.18.0" + "version": "2.19.1" }, { "artifacts": [ @@ -4584,19 +4607,19 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "0657a4ff8411a030a2116a196e8e008ea679696b5b1a8e1a6aa8ebb737b34688", - "url": 
"https://files.pythonhosted.org/packages/4b/04/1cc4fffeb4ace85c205e84cd48eb12cb37ec6ffb68245b7eef8f2086d490/types_PyYAML-6.0.12.20241221-py3-none-any.whl" + "hash": "fa4d32565219b68e6dee5f67534c722e53c00d1cfc09c435ef04d7353e1e96e6", + "url": "https://files.pythonhosted.org/packages/e8/c1/48474fbead512b70ccdb4f81ba5eb4a58f69d100ba19f17c92c0c4f50ae6/types_PyYAML-6.0.12.20241230-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "4f149aa893ff6a46889a30af4c794b23833014c469cc57cbc3ad77498a58996f", - "url": "https://files.pythonhosted.org/packages/f4/60/ba3f23024bdd406e65c359b9dbd9757f058986bd57d94f6639015f9a9fae/types_pyyaml-6.0.12.20241221.tar.gz" + "hash": "7f07622dbd34bb9c8b264fe860a17e0efcad00d50b5f27e93984909d9363498c", + "url": "https://files.pythonhosted.org/packages/9a/f9/4d566925bcf9396136c0a2e5dc7e230ff08d86fa011a69888dd184469d80/types_pyyaml-6.0.12.20241230.tar.gz" } ], "project_name": "types-pyyaml", "requires_dists": [], "requires_python": ">=3.8", - "version": "6.0.12.20241221" + "version": "6.0.12.20241230" }, { "artifacts": [ @@ -4623,19 +4646,19 @@ "artifacts": [ { "algorithm": "sha256", - "hash": "7cbfd3bf2944f88bbcdd321b86ddd878232a277be95d44c78a53585d78ebc2f6", - "url": "https://files.pythonhosted.org/packages/41/2f/051d5d23711209d4077d95c62fa8ef6119df7298635e3a929e50376219d1/types_setuptools-75.6.0.20241223-py3-none-any.whl" + "hash": "a9f12980bbf9bcdc23ecd80755789085bad6bfce4060c2275bc2b4ca9f2bc480", + "url": "https://files.pythonhosted.org/packages/cf/a3/dbfd106751b11c728cec21cc62cbfe7ff7391b935c4b6e8f0bdc2e6fd541/types_setuptools-75.8.0.20250110-py3-none-any.whl" }, { "algorithm": "sha256", - "hash": "d9478a985057ed48a994c707f548e55aababa85fe1c9b212f43ab5a1fffd3211", - "url": "https://files.pythonhosted.org/packages/53/48/a89068ef20e3bbb559457faf0fd3c18df6df5df73b4b48ebf466974e1f54/types_setuptools-75.6.0.20241223.tar.gz" + "hash": "96f7ec8bbd6e0a54ea180d66ad68ad7a1d7954e7281a710ea2de75e355545271", + "url": 
"https://files.pythonhosted.org/packages/f7/42/5713e90d4f9683f2301d900f33e4fc2405ad8ac224dda30f6cb7f4cd215b/types_setuptools-75.8.0.20250110.tar.gz" } ], "project_name": "types-setuptools", "requires_dists": [], "requires_python": ">=3.8", - "version": "75.6.0.20241223" + "version": "75.8.0.20250110" }, { "artifacts": [ @@ -4989,7 +5012,7 @@ "only_wheels": [], "overridden": [], "path_mappings": {}, - "pex_version": "2.10.0", + "pex_version": "2.28.1", "pip_version": "24.1.2", "prefer_older_binary": false, "requirements": [ @@ -5050,6 +5073,7 @@ "networkx~=3.3.0", "packaging>=24.1", "pexpect~=4.8", + "prometheus-client~=0.21.1", "psutil~=6.0", "pycryptodome>=3.20.0", "pydantic~=2.9.2", @@ -5098,5 +5122,6 @@ "mac" ], "transitive": true, - "use_pep517": null + "use_pep517": null, + "use_system_time": false } diff --git a/requirements.txt b/requirements.txt index 9e0605d909f..9ab9a9a513a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,6 +47,7 @@ multidict>=6.1 namedlist~=1.8 networkx~=3.3.0 pexpect~=4.8 +prometheus-client~=0.21.1 psutil~=6.0 pycryptodome>=3.20.0 pyhumps~=3.8.0 diff --git a/src/ai/backend/account_manager/server.py b/src/ai/backend/account_manager/server.py index 6105a02fee7..791a5e7645c 100644 --- a/src/ai/backend/account_manager/server.py +++ b/src/ai/backend/account_manager/server.py @@ -31,6 +31,11 @@ from setproctitle import setproctitle from ai.backend.common.etcd import AsyncEtcd, ConfigScopes +from ai.backend.common.metrics.http import ( + build_api_metric_middleware, + build_prometheus_metrics_handler, +) +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.common.msgpack import DEFAULT_PACK_OPTS, DEFAULT_UNPACK_OPTS from ai.backend.common.types import HostPortPair from ai.backend.common.utils import env_info @@ -185,10 +190,12 @@ def build_root_app( subapp_pkgs: Optional[Sequence[str]] = None, scheduler_opts: Optional[Mapping[str, Any]] = None, ) -> web.Application: + metric_registry = 
CommonMetricRegistry.instance() app = web.Application( middlewares=[ exception_middleware, api_middleware, + build_api_metric_middleware(metric_registry.api), ] ) @@ -257,7 +264,9 @@ async def _call_cleanup_context_shutdown_handlers(app: web.Application) -> None: # should be done in create_app() in other modules. cors.add(app.router.add_route("GET", r"", hello)) cors.add(app.router.add_route("GET", r"/", hello)) - + cors.add( + app.router.add_route("GET", r"/metrics", build_prometheus_metrics_handler(metric_registry)) + ) return app diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 2b80b980f0b..8b3b80065f5 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -102,6 +102,7 @@ from ai.backend.common.events_experimental import EventDispatcher as ExperimentalEventDispatcher from ai.backend.common.exception import VolumeMountFailed from ai.backend.common.lock import FileLock +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.common.plugin.monitor import ErrorPluginContext, StatsPluginContext from ai.backend.common.service_ports import parse_service_ports from ai.backend.common.types import ( @@ -613,6 +614,7 @@ class AbstractAgent( _pending_creation_tasks: Dict[KernelId, Set[asyncio.Task]] _ongoing_exec_batch_tasks: weakref.WeakSet[asyncio.Task] _ongoing_destruction_tasks: weakref.WeakValueDictionary[KernelId, asyncio.Task] + _metric_registry: CommonMetricRegistry def __init__( self, @@ -651,6 +653,7 @@ def __init__( self._pending_creation_tasks = defaultdict(set) self._ongoing_exec_batch_tasks = weakref.WeakSet() self._ongoing_destruction_tasks = weakref.WeakValueDictionary() + self._metric_registry = CommonMetricRegistry.instance() async def __ainit__(self) -> None: """ @@ -678,6 +681,7 @@ async def __ainit__(self) -> None: log_events=self.local_config["debug"]["log-events"], node_id=self.local_config["agent"]["id"], consumer_group=EVENT_DISPATCHER_CONSUMER_GROUP, + 
event_observer=self._metric_registry.event, ) self.redis_stream_pool = redis_helper.get_redis_object( self.local_config["redis"], @@ -690,7 +694,10 @@ async def __ainit__(self) -> None: db=REDIS_STAT_DB, ) - self.background_task_manager = BackgroundTaskManager(self.event_producer) + self.background_task_manager = BackgroundTaskManager( + self.event_producer, + bgtask_observer=self._metric_registry.bgtask, + ) alloc_map_mod.log_alloc_map = self.local_config["debug"]["log-alloc-map"] computers = await self.load_resources() diff --git a/src/ai/backend/agent/config.py b/src/ai/backend/agent/config.py index bfbfeb33280..d836a998036 100644 --- a/src/ai/backend/agent/config.py +++ b/src/ai/backend/agent/config.py @@ -27,6 +27,10 @@ t.Key("agent"): t.Dict({ tx.AliasedKey(["backend", "mode"]): tx.Enum(AgentBackend), t.Key("rpc-listen-addr", default=("", 6001)): tx.HostPortPair(allow_blank_host=True), + t.Key("service-addr", default=("0.0.0.0", 6003)): tx.HostPortPair, + t.Key("ssl-enabled", default=False): t.Bool, + t.Key("ssl-cert", default=None): t.Null | tx.Path(type="file"), + t.Key("ssl-key", default=None): t.Null | tx.Path(type="file"), t.Key("advertised-rpc-addr", default=None): t.Null | tx.HostPortPair, t.Key("rpc-auth-manager-public-key", default=None): t.Null | tx.Path(type="file"), t.Key("rpc-auth-agent-keypair", default=None): t.Null | tx.Path(type="file"), diff --git a/src/ai/backend/agent/server.py b/src/ai/backend/agent/server.py index 52b4c147265..8ec1d51a59b 100644 --- a/src/ai/backend/agent/server.py +++ b/src/ai/backend/agent/server.py @@ -10,6 +10,7 @@ import os.path import shutil import signal +import ssl import sys from collections import OrderedDict, defaultdict from datetime import datetime, timezone @@ -34,10 +35,12 @@ ) from uuid import UUID +import aiohttp_cors import aiomonitor import aiotools import click import tomlkit +from aiohttp import web from aiotools import aclosing from callosum.lower.zeromq import ZeroMQAddress, ZeroMQRPCTransport from 
callosum.ordering import ExitOrderedAsyncScheduler @@ -59,6 +62,11 @@ KernelLifecycleEventReason, KernelTerminatedEvent, ) +from ai.backend.common.metrics.http import ( + build_api_metric_middleware, + build_prometheus_metrics_handler, +) +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.common.types import ( AutoPullBehavior, ClusterInfo, @@ -943,6 +951,27 @@ async def server_main_logwrapper( yield +def build_root_server() -> web.Application: + metric_registry = CommonMetricRegistry.instance() + app = web.Application( + middlewares=[ + build_api_metric_middleware(metric_registry.api), + ], + ) + cors = aiohttp_cors.setup( + app, + defaults={ + "*": aiohttp_cors.ResourceOptions( + allow_credentials=False, expose_headers="*", allow_headers="*" + ), + }, + ) + cors.add( + app.router.add_route("GET", r"/metrics", build_prometheus_metrics_handler(metric_registry)) + ) + return app + + @aiotools.server async def server_main( loop: asyncio.AbstractEventLoop, @@ -1054,6 +1083,28 @@ async def server_main( agent_instance = agent monitor.console_locals["agent"] = agent + app = build_root_server() + runner = web.AppRunner(app) + await runner.setup() + service_addr = local_config["agent"]["service-addr"] + ssl_ctx = None + if local_config["agent"]["ssl-enabled"]: + ssl_ctx = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + ssl_ctx.load_cert_chain( + str(local_config["agent"]["ssl-cert"]), + str(local_config["agent"]["ssl-key"]), + ) + site = web.TCPSite( + runner, + str(service_addr.host), + service_addr.port, + backlog=1024, + reuse_port=True, + ssl_context=ssl_ctx, + ) + await site.start() + log.info("started serving HTTP at {}", service_addr) + # Run! 
try: async with agent: diff --git a/src/ai/backend/common/bgtask.py b/src/ai/backend/common/bgtask.py index 1a9a9df78fe..73ddb427e60 100644 --- a/src/ai/backend/common/bgtask.py +++ b/src/ai/backend/common/bgtask.py @@ -16,6 +16,7 @@ DefaultDict, Final, Literal, + Protocol, Set, Type, TypeAlias, @@ -108,17 +109,38 @@ async def _pipe_builder(r: Redis) -> Pipeline: BackgroundTask = Callable[Concatenate[ProgressReporter, ...], Awaitable[str | None]] +class BackgroundTaskObserver(Protocol): + def observe_bgtask_started(self, *, task_name: str) -> None: ... + def observe_bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: ... + + +class NopBackgroundTaskObserver: + def observe_bgtask_started(self, *, task_name: str) -> None: + pass + + def observe_bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: + pass + + class BackgroundTaskManager: event_producer: EventProducer ongoing_tasks: weakref.WeakSet[asyncio.Task] task_update_queues: DefaultDict[uuid.UUID, Set[asyncio.Queue[Sentinel | BgtaskEvents]]] dict_lock: asyncio.Lock - def __init__(self, event_producer: EventProducer) -> None: + _metric_observer: BackgroundTaskObserver + + def __init__( + self, + event_producer: EventProducer, + *, + bgtask_observer: BackgroundTaskObserver = NopBackgroundTaskObserver(), + ) -> None: self.event_producer = event_producer self.ongoing_tasks = weakref.WeakSet() self.task_update_queues = defaultdict(set) self.dict_lock = asyncio.Lock() + self._metric_observer = bgtask_observer def register_event_handlers(self, event_dispatcher: EventDispatcher) -> None: """ @@ -280,6 +302,8 @@ async def _wrapper_task( event_cls: Type[BgtaskDoneEvent] | Type[BgtaskCancelledEvent] | Type[BgtaskFailedEvent] = ( BgtaskDoneEvent ) + self._metric_observer.observe_bgtask_started(task_name=task_name or func.__name__) + start = time.perf_counter() try: message = await func(reporter, **kwargs) or "" task_status = "bgtask_done" @@ -292,6 +316,10 @@ async def 
_wrapper_task( message = repr(e) log.exception("Task {} ({}): unhandled error", task_id, task_name) finally: + duration = time.perf_counter() - start + self._metric_observer.observe_bgtask_done( + task_name=task_name or func.__name__, status=task_status, duration=duration + ) redis_producer = self.event_producer.redis_client async def _pipe_builder(r: Redis): diff --git a/src/ai/backend/common/events.py b/src/ai/backend/common/events.py index ed91bac87b3..8bb6bbc4e1e 100644 --- a/src/ai/backend/common/events.py +++ b/src/ai/backend/common/events.py @@ -7,6 +7,7 @@ import logging import secrets import socket +import time import uuid from collections import defaultdict from typing import ( @@ -938,6 +939,24 @@ async def rate_control(self, opts: CoalescingOptions | None) -> bool: return True +class EventObserver(Protocol): + def observe_event_success(self, *, event_type: str, duration: float) -> None: ... + + def observe_event_failure( + self, *, event_type: str, duration: float, exception: Exception + ) -> None: ... + + +class NopEventObserver: + def observe_event_success(self, *, event_type: str, duration: float) -> None: + pass + + def observe_event_failure( + self, *, event_type: str, duration: float, exception: Exception + ) -> None: + pass + + class EventDispatcher(aobject): """ We have two types of event handlers: consumer and subscriber. 
@@ -963,6 +982,7 @@ class EventDispatcher(aobject): _log_events: bool _consumer_name: str + _metric_observer: EventObserver def __init__( self, @@ -976,6 +996,7 @@ def __init__( node_id: str | None = None, consumer_exception_handler: AsyncExceptionHandler | None = None, subscriber_exception_handler: AsyncExceptionHandler | None = None, + event_observer: EventObserver = NopEventObserver(), ) -> None: _redis_config = redis_config.copy() if service_name: @@ -990,6 +1011,7 @@ def __init__( self._stream_key = stream_key self._consumer_group = consumer_group self._consumer_name = _generate_consumer_id(node_id) + self._metric_observer = event_observer self.consumer_taskgroup = PersistentTaskGroup( name="consumer_taskgroup", exception_handler=consumer_exception_handler, @@ -1165,15 +1187,29 @@ async def _consume_loop(self) -> None: return if msg_data is None: continue + event_type = "unknown" + start = time.perf_counter() try: + decoded = msg_data[b"name"].decode() + if decoded and isinstance(decoded, str): + event_type = decoded await self.dispatch_consumers( - msg_data[b"name"].decode(), + decoded, msg_data[b"source"].decode(), msgpack.unpackb(msg_data[b"args"]), ) + self._metric_observer.observe_event_success( + event_type=event_type, + duration=time.perf_counter() - start, + ) except asyncio.CancelledError: raise - except Exception: + except Exception as e: + self._metric_observer.observe_event_failure( + event_type=event_type, + duration=time.perf_counter() - start, + exception=e, + ) log.exception("EventDispatcher.consume(): unexpected-error") @preserve_termination_log @@ -1189,15 +1225,29 @@ async def _subscribe_loop(self) -> None: return if msg_data is None: continue + event_type = "unknown" + start = time.perf_counter() try: + decoded = msg_data[b"name"].decode() + if decoded and isinstance(decoded, str): + event_type = decoded await self.dispatch_subscribers( - msg_data[b"name"].decode(), + decoded, msg_data[b"source"].decode(), 
msgpack.unpackb(msg_data[b"args"]), ) + self._metric_observer.observe_event_success( + event_type=event_type, + duration=time.perf_counter() - start, + ) except asyncio.CancelledError: raise - except Exception: + except Exception as e: + self._metric_observer.observe_event_failure( + event_type=event_type, + duration=time.perf_counter() - start, + exception=e, + ) log.exception("EventDispatcher.subscribe(): unexpected-error") diff --git a/src/ai/backend/common/events_experimental.py b/src/ai/backend/common/events_experimental.py index e81ef3c7ad2..1049cc3c097 100644 --- a/src/ai/backend/common/events_experimental.py +++ b/src/ai/backend/common/events_experimental.py @@ -3,7 +3,7 @@ import time from collections import defaultdict from collections.abc import AsyncIterable -from typing import Any +from typing import Any, Protocol import hiredis from aiomonitor.task import preserve_termination_log @@ -157,6 +157,24 @@ async def read_stream_by_group( raise +class EventObserver(Protocol): + def observe_event_success(self, *, event_type: str, duration: float) -> None: ... + + def observe_event_failure( + self, *, event_type: str, duration: float, exception: Exception + ) -> None: ... 
+ + +class NopEventObserver: + def observe_event_success(self, *, event_type: str, duration: float) -> None: + pass + + def observe_event_failure( + self, *, event_type: str, duration: float, exception: Exception + ) -> None: + pass + + class EventDispatcher(_EventDispatcher): redis_config: EtcdRedisConfig db: int diff --git a/src/ai/backend/common/metrics/BUILD b/src/ai/backend/common/metrics/BUILD new file mode 100644 index 00000000000..73574424040 --- /dev/null +++ b/src/ai/backend/common/metrics/BUILD @@ -0,0 +1 @@ +python_sources(name="src") diff --git a/src/ai/backend/common/metrics/http.py b/src/ai/backend/common/metrics/http.py new file mode 100644 index 00000000000..ca156801f37 --- /dev/null +++ b/src/ai/backend/common/metrics/http.py @@ -0,0 +1,55 @@ +import time +from typing import Protocol + +from aiohttp import web +from aiohttp.typedefs import Handler, Middleware + + +class APIMetricObserverProtocol(Protocol): + def observe_request( + self, *, method: str, endpoint: str, status_code: int, duration: float + ) -> None: ... + + +class PrometheusAPIMetric(Protocol): + def to_prometheus(self) -> str: ... 
+ + +def build_api_metric_middleware(metric: APIMetricObserverProtocol) -> Middleware: + @web.middleware + async def metric_middleware(request: web.Request, handler: Handler) -> web.StreamResponse: + # normalize path + method = request.method + endpoint = getattr(request.match_info.route.resource, "canonical", request.path) + status_code = -1 + start = time.perf_counter() + try: + resp = await handler(request) + status_code = resp.status + except web.HTTPError as e: + status_code = e.status_code + raise + except Exception: + status_code = 500 + raise + else: + return resp + finally: + end = time.perf_counter() + elapsed = end - start + metric.observe_request( + method=method, endpoint=endpoint, status_code=status_code, duration=elapsed + ) + + return metric_middleware + + +def build_prometheus_metrics_handler(prometheus_metric: PrometheusAPIMetric) -> Handler: + async def prometheus_metrics_handler(request: web.Request) -> web.Response: + """ + Returns the Prometheus metrics. + """ + metrics = prometheus_metric.to_prometheus() + return web.Response(text=metrics, content_type="text/plain") + + return prometheus_metrics_handler diff --git a/src/ai/backend/common/metrics/metric.py b/src/ai/backend/common/metrics/metric.py new file mode 100644 index 00000000000..443bbcce7a4 --- /dev/null +++ b/src/ai/backend/common/metrics/metric.py @@ -0,0 +1,205 @@ +import asyncio +import os +from typing import Optional, Self + +import psutil +from prometheus_client import Counter, Gauge, Histogram, generate_latest + + +class APIMetricObserver: + _instance: Optional[Self] = None + + _request_count: Counter + _request_duration_sec: Histogram + + def __init__(self) -> None: + self._request_count = Counter( + name="backendai_api_request_count", + documentation="Total number of API requests", + labelnames=["method", "endpoint", "status_code"], + ) + self._request_duration_sec = Histogram( + name="backendai_api_request_duration_sec", + documentation="Duration of API requests in 
seconds", + labelnames=["method", "endpoint", "status_code"], + buckets=[0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 30], + ) + + @classmethod + def instance(cls) -> Self: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def _inc_request_total(self, *, method: str, endpoint: str, status_code: int) -> None: + self._request_count.labels(method=method, endpoint=endpoint, status_code=status_code).inc() + + def _observe_request_duration( + self, *, method: str, endpoint: str, status_code: int, duration: float + ) -> None: + self._request_duration_sec.labels( + method=method, + endpoint=endpoint, + status_code=status_code, + ).observe(duration) + + def observe_request( + self, *, method: str, endpoint: str, status_code: int, duration: float + ) -> None: + self._inc_request_total(method=method, endpoint=endpoint, status_code=status_code) + self._observe_request_duration( + method=method, endpoint=endpoint, status_code=status_code, duration=duration + ) + + +class EventMetricObserver: + _instance: Optional[Self] = None + + _event_count: Counter + _event_failure_count: Counter + _event_processing_time_sec: Histogram + + def __init__(self) -> None: + self._event_count = Counter( + name="backendai_event_count", + documentation="Total number of events processed", + labelnames=["event_type"], + ) + self._event_failure_count = Counter( + name="backendai_event_failure_count", + documentation="Number of failed events", + labelnames=["event_type", "exception"], + ) + self._event_processing_time_sec = Histogram( + name="backendai_event_processing_time_sec", + documentation="Processing time of events in seconds", + labelnames=["event_type", "status"], + buckets=[0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 30], + ) + + @classmethod + def instance(cls) -> Self: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def observe_event_success(self, *, event_type: str, duration: float) -> None: + 
self._event_count.labels(event_type=event_type).inc() + self._event_processing_time_sec.labels(event_type=event_type, status="success").observe( + duration + ) + + def observe_event_failure( + self, *, event_type: str, duration: float, exception: Exception + ) -> None: + exception_name = exception.__class__.__name__ + self._event_failure_count.labels(event_type=event_type, exeception=exception_name).inc() + self._event_count.labels(event_type=event_type).inc() + self._event_processing_time_sec.labels(event_type=event_type, status="failure").observe( + duration + ) + + +class BgTaskMetricObserver: + _instance: Optional[Self] = None + + _bgtask_count: Gauge + _bgtask_done_count: Counter + _bgtask_processing_time: Histogram + + def __init__(self) -> None: + self._bgtask_count = Gauge( + name="backendai_bgtask_count", + documentation="Total number of background tasks processed", + labelnames=["task_name"], + ) + self._bgtask_done_count = Counter( + name="backendai_bgtask_done_count", + documentation="Number of completed background tasks", + labelnames=["task_name", "status"], + ) + self._bgtask_processing_time = Histogram( + name="backendai_bgtask_processing_time_sec", + documentation="Processing time of background tasks in seconds", + labelnames=["task_name", "status"], + buckets=[0.1, 1, 10, 30, 60, 300, 600], + ) + + @classmethod + def instance(cls) -> Self: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def observe_bgtask_started(self, *, task_name: str) -> None: + self._bgtask_count.labels(task_name=task_name).inc() + + def observe_bgtask_done(self, *, task_name: str, status: str, duration: float) -> None: + self._bgtask_count.labels(task_name=task_name).dec() + self._bgtask_processing_time.labels(task_name=task_name, status=status).observe(duration) + self._bgtask_done_count.labels(task_name=task_name, status=status).inc() + + +class SystemMetricObserver: + _instance: Optional[Self] = None + + _async_task_count: Gauge + 
_cpu_usage_percent: Gauge + _memory_used_rss: Gauge + _memory_used_vms: Gauge + + def __init__(self) -> None: + self._async_task_count = Gauge( + name="backendai_async_task_count", + documentation="Number of active async tasks", + ) + self._cpu_usage_percent = Gauge( + name="backendai_cpu_usage_percent", + documentation="CPU usage of the process", + ) + self._memory_used_rss = Gauge( + name="backendai_memory_used_rss", + documentation="Memory used by the process in RSS", + ) + self._memory_used_vms = Gauge( + name="backendai_memory_used_vms", + documentation="Memory used by the process in VMS", + ) + + @classmethod + def instance(cls) -> Self: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def observe(self) -> None: + self._async_task_count.set(len(asyncio.all_tasks())) + proc = psutil.Process(os.getpid()) + self._cpu_usage_percent.set(proc.cpu_percent()) + self._memory_used_rss.set(proc.memory_info().rss) + self._memory_used_vms.set(proc.memory_info().vms) + + +class CommonMetricRegistry: + _instance: Optional[Self] = None + + api: APIMetricObserver + event: EventMetricObserver + bgtask: BgTaskMetricObserver + system: SystemMetricObserver + + def __init__(self) -> None: + self.api = APIMetricObserver.instance() + self.event = EventMetricObserver.instance() + self.bgtask = BgTaskMetricObserver.instance() + self.system = SystemMetricObserver.instance() + + @classmethod + def instance(cls): + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def to_prometheus(self) -> str: + self.system.observe() + return generate_latest().decode("utf-8") diff --git a/src/ai/backend/manager/api/context.py b/src/ai/backend/manager/api/context.py index 7616fe51384..5d7cf4bb5dd 100644 --- a/src/ai/backend/manager/api/context.py +++ b/src/ai/backend/manager/api/context.py @@ -4,6 +4,7 @@ import attrs +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.manager.plugin.network import 
def __init__(self, *, metrics: CommonMetricRegistry | None = None, **kwargs) -> None:
    """
    Initialize the context and attach the metric registry.

    :param metrics: Registry to expose as ``self.metrics``.  When omitted, the
        shared ``CommonMetricRegistry.instance()`` is used.
    :param kwargs: Forwarded unchanged to the parent initializer.
    """
    super().__init__(**kwargs)
    # Resolve the default at call time: a ``CommonMetricRegistry()`` default in
    # the signature would be built once, as a side effect of importing this
    # module, and would bypass the registry's own singleton accessor.
    self.metrics = metrics if metrics is not None else CommonMetricRegistry.instance()
@skip_token_auth
async def prometheus_metrics_handler(request: web.Request) -> web.Response:
    """
    Serve the metric registry of the application's root context.

    Reads the root context from ``request.app["ctx"]`` and responds with the
    text returned by its ``metric_registry.to_prometheus()`` as ``text/plain``.
    """
    ctx: RootContext = request.app["ctx"]
    return web.Response(
        text=ctx.metric_registry.to_prometheus(),
        content_type="text/plain",
    )
token_auth_middleware, + build_api_metric_middleware(ctx.metric_registry.api), ], ) app["ctx"] = ctx @@ -1152,6 +1161,7 @@ async def init_manager_app(ctx: RootContext) -> web.Application: app["app_ctx"] = app_ctx app.on_shutdown.append(_shutdown) app.router.add_route("GET", "/", check_status) + app.router.add_route("GET", "/metrics", prometheus_metrics_handler) app.router.add_route("GET", "/status", check_status) app.router.add_route("GET", "/volumes", get_volumes) app.router.add_route("GET", "/volume/hwinfo", get_hwinfo) diff --git a/src/ai/backend/storage/context.py b/src/ai/backend/storage/context.py index 6b96f120788..bda892e25ac 100644 --- a/src/ai/backend/storage/context.py +++ b/src/ai/backend/storage/context.py @@ -21,6 +21,7 @@ EventDispatcher, EventProducer, ) +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.logging import BraceStyleAdapter from .abc import AbstractVolume @@ -102,6 +103,7 @@ class RootContext: event_producer: EventProducer event_dispatcher: EventDispatcher watcher: WatcherClient | None + metric_registry: CommonMetricRegistry def __init__( self, @@ -115,6 +117,7 @@ def __init__( event_dispatcher: EventDispatcher, watcher: WatcherClient | None, dsn: Optional[str] = None, + metric_registry: CommonMetricRegistry = CommonMetricRegistry.instance(), ) -> None: self.volumes = {} self.pid = pid @@ -131,6 +134,7 @@ def __init__( allow_credentials=False, expose_headers="*", allow_headers="*" ), } + self.metric_registry = metric_registry async def __aenter__(self) -> None: self.client_api_app = await init_client_app(self) diff --git a/src/ai/backend/storage/server.py b/src/ai/backend/storage/server.py index 4e310b50541..b835d45001f 100644 --- a/src/ai/backend/storage/server.py +++ b/src/ai/backend/storage/server.py @@ -26,6 +26,7 @@ from ai.backend.common.defs import REDIS_STREAM_DB from ai.backend.common.events import EventDispatcher, EventProducer from ai.backend.common.events_experimental import EventDispatcher as 
ExperimentalEventDispatcher +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.common.msgpack import DEFAULT_PACK_OPTS, DEFAULT_UNPACK_OPTS from ai.backend.common.types import safe_print_redis_config from ai.backend.common.utils import env_info @@ -98,7 +99,7 @@ async def server_main( aiomon_started = True except Exception as e: log.warning("aiomonitor could not start but skipping this error to continue", exc_info=e) - + metric_registry = CommonMetricRegistry() try: etcd = load_shared_config(local_config) try: @@ -136,6 +137,7 @@ async def server_main( log_events=local_config["debug"]["log-events"], node_id=local_config["storage-proxy"]["node-id"], consumer_group=EVENT_DISPATCHER_CONSUMER_GROUP, + event_observer=metric_registry, ) log.info( "PID: {0} - Event dispatcher created. (redis_config: {1})", @@ -169,6 +171,7 @@ async def server_main( event_producer=event_producer, event_dispatcher=event_dispatcher, watcher=watcher_client, + metric_registry=metric_registry, ) async with ctx: m.console_locals["ctx"] = ctx diff --git a/src/ai/backend/wsproxy/server.py b/src/ai/backend/wsproxy/server.py index aa689fdf561..fe42d1c8864 100644 --- a/src/ai/backend/wsproxy/server.py +++ b/src/ai/backend/wsproxy/server.py @@ -24,6 +24,11 @@ from aiohttp.typedefs import Middleware from setproctitle import setproctitle +from ai.backend.common.metrics.http import ( + build_api_metric_middleware, + build_prometheus_metrics_handler, +) +from ai.backend.common.metrics.metric import CommonMetricRegistry from ai.backend.common.msgpack import DEFAULT_PACK_OPTS, DEFAULT_UNPACK_OPTS from ai.backend.common.utils import env_info from ai.backend.logging import BraceStyleAdapter, Logger, LogLevel @@ -233,11 +238,13 @@ def build_root_app( cleanup_contexts: Sequence[CleanupContext] | None = None, subapp_pkgs: Sequence[str] = [], ) -> web.Application: + metric_registry = CommonMetricRegistry.instance() app = web.Application( middlewares=[ 
request_context_aware_middleware, exception_middleware, api_middleware, + build_api_metric_middleware(metric_registry.api), ] ) root_ctx = RootContext() @@ -293,6 +300,9 @@ async def _call_cleanup_context_shutdown_handlers(app: web.Application) -> None: cors.add(app.router.add_route("GET", r"", hello)) cors.add(app.router.add_route("GET", r"/", hello)) cors.add(app.router.add_route("GET", "/status", status)) + cors.add( + app.router.add_route("GET", "/metrics", build_prometheus_metrics_handler(metric_registry)) + ) if subapp_pkgs is None: subapp_pkgs = [] for pkg_name in subapp_pkgs: