diff --git a/jupyter_server/prometheus/metrics.py b/jupyter_server/prometheus/metrics.py
index 73a5a02040..70df23dc38 100644
--- a/jupyter_server/prometheus/metrics.py
+++ b/jupyter_server/prometheus/metrics.py
@@ -5,7 +5,16 @@
 conventions for metrics & labels.
 """
 
+from prometheus_client import Counter
+
 try:
+    import notebook # type: ignore
+
+    if notebook.__name__ != "notebook":
+        # avoid double-importing myself if nbclassic is shimming jupyter_server into notebook,
+        # in which case notebook.__name__ will be 'jupyter_server'
+        _msg = "Not importing jupyter_server metrics under two names"
+        raise ImportError(_msg)
     # Jupyter Notebook also defines these metrics. Re-defining them results in a ValueError.
     # Try to de-duplicate by using the ones in Notebook if available.
     # See https://github.com/jupyter/jupyter_server/issues/209
@@ -34,3 +43,9 @@
         "counter for how many kernels are running labeled by type",
         ["type"],
     )
+
+KERNEL_RESTARTS = Counter(
+    "jupyter_kernel_restarts",
+    "counter for how many kernel restarts, labeled by kernel_name and source (user or restarter)",
+    ["kernel_name", "source"],
+)
diff --git a/jupyter_server/services/kernels/handlers.py b/jupyter_server/services/kernels/handlers.py
index db8a39ebde..9c5d27d2a7 100644
--- a/jupyter_server/services/kernels/handlers.py
+++ b/jupyter_server/services/kernels/handlers.py
@@ -16,6 +16,7 @@
 from tornado import web
 
 from jupyter_server.auth import authorized
+from jupyter_server.prometheus.metrics import KERNEL_RESTARTS
 from jupyter_server.utils import url_escape, url_path_join
 
 from ...base.handlers import APIHandler
@@ -104,6 +105,7 @@ async def post(self, kernel_id, action):
                 self.set_status(500)
             else:
                 model = await ensure_async(km.kernel_model(kernel_id))
+                KERNEL_RESTARTS.labels(kernel_name=model["name"], source="user").inc()
                 self.write(json.dumps(model, default=json_default))
         self.finish()
 
diff --git a/jupyter_server/services/kernels/kernelmanager.py b/jupyter_server/services/kernels/kernelmanager.py
index b1ca1bbfa2..c89406fe1f 100644
--- a/jupyter_server/services/kernels/kernelmanager.py
+++ b/jupyter_server/services/kernels/kernelmanager.py
@@ -38,7 +38,7 @@
 )
 
 from jupyter_server._tz import isoformat, utcnow
-from jupyter_server.prometheus.metrics import KERNEL_CURRENTLY_RUNNING_TOTAL
+from jupyter_server.prometheus.metrics import KERNEL_CURRENTLY_RUNNING_TOTAL, KERNEL_RESTARTS
 from jupyter_server.utils import ApiPath, import_item, to_os_path
 
 
@@ -179,6 +179,10 @@ def __init__(self, **kwargs):
     # Methods for managing kernels and sessions
     # -------------------------------------------------------------------------
 
+    def _handle_kernel_restart(self, kernel_id, kernel_name):
+        """notice that a kernel restarted"""
+        KERNEL_RESTARTS.labels(kernel_name=kernel_name, source="restarter").inc()
+
     def _handle_kernel_died(self, kernel_id):
         """notice that a kernel died"""
         self.log.warning("Kernel %s died, removing from map.", kernel_id)
@@ -279,6 +283,11 @@ async def _finish_kernel_start(self, kernel_id):
             lambda: self._handle_kernel_died(kernel_id),
             "dead",
         )
+        # register callback to count restarts
+        self.add_restart_callback(
+            kernel_id,
+            lambda: self._handle_kernel_restart(kernel_id, km.kernel_name),
+        )
 
     def ports_changed(self, kernel_id):
         """Used by ZMQChannelsHandler to determine how to coordinate nudge and replays.