From 1617f24407b3bd9b014d8463ad5c5fe238506879 Mon Sep 17 00:00:00 2001 From: Charles Turner Date: Tue, 21 Jan 2025 15:37:07 +0800 Subject: [PATCH] Update readme to match changes to configuration --- README.md | 112 +++++++++++++---- config_diff.git | 322 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 410 insertions(+), 24 deletions(-) create mode 100644 config_diff.git diff --git a/README.md b/README.md index c342383..e1640cc 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ This needs to be added to the system config for ipython, or it can be added to y If this package is used within a Jupyter notebook, telemetry calls will be made asynchronously, so as to not block the execution of the notebook. This means that the telemetry calls will be made in the background, and will not affect the performance of the notebook. -If you are using this in a REPL, telemetry calls are currently synchronous, and will block the execution of the code until the telemetry call is made. This will be fixed in a future release. +Outside a Jupyter notebook, telemetry calls will be made in a new python process using the multiprocessing module, and so will be non-blocking but may have a small overhead. ![PyPI version](https://img.shields.io/pypi/v/access_py_telemetry.svg) ![Build Status](https://img.shields.io/travis/access-nri/access_py_telemetry.svg) @@ -68,13 +68,76 @@ Contains IPython extensions to automatically add telemetry to catalog usage. ### Registering & deregistering functions for telemetry -To add a function to the list of functions about which usage information is collected when telemetry is enabled, use the `TelemetryRegister` class, and it's `register` method. You can pass the function name as a string, or the function itself. +#### The TelemetryRegister class + +The `TelemetryRegister` class is used to register and deregister functions for telemetry. By default, it will read from `config.yaml` to get the list of functions to register. + +A sample `config.yaml` file is shown below: + +```yaml +intake: + catalog: + - esm_datastore.search + - DfFileCatalog.search + - DfFileCatalog.__getitem__ +payu: + run: + - Experiment.run + restart: + - Experiment.restart +``` + +This config file has two main purposes: to provide a list of function calls which ought to be tracked, and to specify where the telemetry data should be sent. + +In this example, there are three endpoints: +1. `intake/catalog` +2. `payu/run` +3. `payu/restart` +which track the corresponding sets of functions: +1. `{esm_datastore.search, DfFileCatalog.search, DfFileCatalog.__getitem__}` +2. `{Experiment.run}` +3. `{Experiment.restart}` -> **Important** -> ___ -> Each `service` corresponds to a single `endpoint` in the tracking services app. -> This is something of a misnomer, and will eventually be renamed to something more appropriate. +*Service Names* are built from the config file, and are built by replacing the `/` with a `_` in the endpoint name - ie. +1. `intake_catalog` <=> `intake/catalog` +2. `payu_run` <=> `payu/run` +3. `payu_restart` <=> `payu/restart` +Typically, the top level part service name (eg. `intake`) will correspond to both a Django app and a single client side package (eg. intake, Payu, etc that you wish to track), and the rest of the endpoint will correspond to a view within that app. For example, if you had a package named `executor` for which you wanted to track `run` and `save_results` functions in separate tables, you would have the following config: +```yaml +executor: + run: + - executor.run + save_results: + - executor.save_results +``` + +The corresponding models in the `tracking_services` Django app would be `ExecutorRun` and `ExecutorSaveResults`: + +```python +class ExecutorRun(models.Model): + function_name = models.CharField(max_length=255) + args = JSONField() + kwargs = JSONField() + session_id = models.CharField(max_length=255) + interesting_data = JSONField() + timestamp = models.DateTimeField(auto_now_add=True) + +class ExecutorSaveResults(models.Model): + function_name = models.CharField(max_length=255) + args = JSONField() + kwargs = JSONField() + session_id = models.CharField(max_length=255) + timestamp = models.DateTimeField(auto_now_add=True) + save_filesize = models.IntegerField() + user_id = models.CharField(max_length=255) + execution_time = models.FloatField() + memory_usage = models.FloatField() + cpu_usage = models.FloatField() +``` + + +To add a function to the list of functions about which usage information is collected when telemetry is enabled, use the `TelemetryRegister` class, and it's `register` method. You can pass the function name as a string, or the function itself. ```python from access_py_telemetry.registry import TelemetryRegister @@ -132,7 +195,7 @@ def my_func(): Specifying the `extra_fields` argument will add additional fields to the telemetry data sent to the endpoint. Alternatively, these can be added later: ```python -from access_py_telemetry.utils import ApiHandler +from access_py_telemetry.api import ApiHandler from access_py_telemetry.decorators import ipy_register_func @ipy_register_func("my_service") @@ -145,10 +208,10 @@ api_handler.add_extra_field("my_service", {"interesting_data": interesting_data} Adding fields later may sometimes be necessary, as the data may not be available at the time of registration/function definition, but will be when the function is called. -We can also remove fields from the telemetry data, using the `pop_fields` method. This might be handy for example, if you want to remove a default field. For example, telemetry will include a session ID (bound to the Python interpreter lifetime) by default - if you are writing a CLI tool, you may want to remove this field. +We can also remove fields from the telemetry data, using the `pop_fields` method. This might be handy for example, if you want to remove a default field. For example, telemetry will include a session ID (bound to the Python interpreter lifetime) by default - if you are writing a CLI tool, you will probably want to remove this field. ```python -from access_py_telemetry.utils import ApiHandler +from access_py_telemetry.api import ApiHandler from access_py_telemetry.decorators import register_func @register_func("my_service", extra_fields = [{"cli_config" : ...}, {"interesting_data" : ...}]) @@ -183,8 +246,8 @@ def my_func(): ### Checking registry (Assuming `my_func` has been registered as above) ```python ->>> cat_registry = TelemetryRegister('catalog') ->>> print(cat_registry) +>>> intake_registry = TelemetryRegister('intake_catalog') +>>> print(intake_registry) ["esm_datastore.search", "DfFileCatalog.search", "DfFileCatalog.__getitem__"] >>> my_registry = TelemetryRegister('my_service') >>> print(my_registry) @@ -195,19 +258,19 @@ def my_func(): When you are happy with your telemetry configuration, you can update the default registry with your custom registry. This should be done via a PR, in which you update the `registry.yaml` file with your addtional functionality to track: +In the case of `my_service`, you would add the following to `registry.yaml`: + ```yaml -catalog: - endpoint: /intake/update - items: +intake: + catalog: - esm_datastore.search - DfFileCatalog.search - DfFileCatalog.__getitem__ -+ my_service: -+ endpoint: /my_service/endpoint -+ items: -+ - my_func -+ - my_other_func ++ my: ++ service: ++ - my_func ++ - my_other_func ``` @@ -224,7 +287,7 @@ Presently, please raise an issue on the [tracking-services](https://github.com/A __Once you have an endpoint__, you can send telemetry using the `ApiHandler` class. ```python -from access_py_telemetry.utils import ApiHandler +from access_py_telemetry.api import ApiHandler from xyz import interesting_data @@ -259,7 +322,7 @@ The `ApiHandler` class will send telemetry data to the endpoint you specify. To ``` If you have not registered any extra fields, the `interesting_data` field will not be present. -Configuration of extra fields, etc, should be performed as import time side effects of you code in order to ensure telemetry data is sent correctly & consistently. +Configuration of extra fields, etc, should be performed as import time side effects of you code in order to ensure telemetry data are sent correctly & consistently. #### Implementation details @@ -267,7 +330,7 @@ The `ApiHandler` class is a singleton, so if you want to configure extra fields eg. `myservice/component1.py` ```python -from access_py_telemetry.utils import ApiHandler +from access_py_telemetry.api import ApiHandler api_handler = ApiHandler() service_component1_config = { @@ -278,7 +341,7 @@ api_handler.add_extra_field("myservice", service_component1_config) ``` and `myservice/component2.py` ```python -from access_py_telemetry.utils import ApiHandler +from access_py_telemetry.api import ApiHandler api_handler = ApiHandler() service_component2_config = { @@ -311,7 +374,7 @@ Then, when telemetry is sent, you will see the `component_1_config` and `compone In order to track user sessions, this package uses a Session Identifier, generated using the SessionID class: ```python ->>> from access_py_telemetry.utils import SessionID +>>> from access_py_telemetry.api import SessionID >>> session_id = SessionID() >>> session_id @@ -371,3 +434,4 @@ Note that the date is the first time the project is created. The date signifies the year from which the copyright notice applies. **NEVER** replace with a later year, only ever add later years or a year range. It is not necessary to include subsequent years in the copyright statement at all unless updates have been made at a later time, and even then it is largely discretionary: they are not necessary as copyright is contingent on the lifespan of copyright holder +50 years as per the [Berne Convention](https://en.wikipedia.org/wiki/Berne_Convention). + diff --git a/config_diff.git b/config_diff.git new file mode 100644 index 0000000..2c5602b --- /dev/null +++ b/config_diff.git @@ -0,0 +1,322 @@ +diff --git a/src/access_py_telemetry/__init__.py b/src/access_py_telemetry/__init__.py +index e32d12b..c59a60e 100644 +--- a/src/access_py_telemetry/__init__.py ++++ b/src/access_py_telemetry/__init__.py +@@ -11,8 +11,9 @@ from IPython.core.interactiveshell import InteractiveShell + + from . import _version + from .ast import capture_registered_calls +-from .api import SessionID, ENDPOINTS # noqa +-from .registry import REGISTRIES, RegisterWarning ++from .api import SessionID # noqa ++from .registry import RegisterWarning ++from .utils import ENDPOINTS, REGISTRIES + + + # Make sure that our registries & endpoints match up +diff --git a/src/access_py_telemetry/api.py b/src/access_py_telemetry/api.py +index 82396b7..526db5f 100644 +--- a/src/access_py_telemetry/api.py ++++ b/src/access_py_telemetry/api.py +@@ -13,6 +13,7 @@ import asyncio + import pydantic + import yaml + from pathlib import Path ++from .utils import ENDPOINTS, REGISTRIES + + S = TypeVar("S", bound="SessionID") + H = TypeVar("H", bound="ApiHandler") +@@ -20,8 +21,6 @@ H = TypeVar("H", bound="ApiHandler") + with open(Path(__file__).parent / "config.yaml", "r") as f: + config = yaml.safe_load(f) + +-ENDPOINTS = {registry: content.get("endpoint") for registry, content in config.items()} +-REGISTRIES = {registry for registry in config.keys()} + SERVER_URL = "https://tracking-services-d6c2fd311c12.herokuapp.com" + + +@@ -36,11 +35,9 @@ class ApiHandler: + + _instance = None + _server_url = SERVER_URL[:] +- endpoints = {key: val for key, val in ENDPOINTS.items()} +- registries = {key for key in REGISTRIES} +- _extra_fields: dict[str, dict[str, Any]] = { +- ep_name: {} for ep_name in ENDPOINTS.keys() +- } ++ endpoints = {service: endpoint for service, endpoint in ENDPOINTS.items()} ++ registries = {service for service in REGISTRIES} ++ _extra_fields: dict[str, dict[str, Any]] = {ep_name: {} for ep_name in ENDPOINTS} + _pop_fields: dict[str, list[str]] = {} + + def __new__(cls: Type[H]) -> H: +diff --git a/src/access_py_telemetry/ast.py b/src/access_py_telemetry/ast.py +index 8488f60..e333557 100644 +--- a/src/access_py_telemetry/ast.py ++++ b/src/access_py_telemetry/ast.py +@@ -9,7 +9,8 @@ from IPython.core.getipython import get_ipython + from IPython.core.interactiveshell import ExecutionInfo + + from .api import ApiHandler +-from .registry import TelemetryRegister, REGISTRIES ++from .registry import TelemetryRegister ++from .utils import REGISTRIES + + + api_handler = ApiHandler() +diff --git a/src/access_py_telemetry/config.yaml b/src/access_py_telemetry/config.yaml +index 29ad955..d8023db 100644 +--- a/src/access_py_telemetry/config.yaml ++++ b/src/access_py_telemetry/config.yaml +@@ -1,10 +1,10 @@ +-catalog: +- endpoint: /intake/update +- items: ++intake: ++ catalog: + - esm_datastore.search + - DfFileCatalog.search + - DfFileCatalog.__getitem__ + payu: +- endpoint: /payu/update +- items: ++ run: + - Experiment.run ++ restart: ++ - Experiment.restart +\ No newline at end of file +diff --git a/src/access_py_telemetry/registry.py b/src/access_py_telemetry/registry.py +index f74b416..5bf3090 100644 +--- a/src/access_py_telemetry/registry.py ++++ b/src/access_py_telemetry/registry.py +@@ -4,20 +4,12 @@ SPDX-License-Identifier: Apache-2.0 + """ + + from typing import Type, TypeVar, Iterator, Callable, Any +-from pathlib import Path + import pydantic +-import yaml + import copy ++from .utils import REGISTRIES + + T = TypeVar("T", bound="TelemetryRegister") + +-with open(Path(__file__).parent / "config.yaml", "r") as f: +- config = yaml.safe_load(f) +- +-REGISTRIES = { +- registry: set(content.get("items")) for registry, content in config.items() +-} +- + + class RegisterWarning(UserWarning): + """ +@@ -33,8 +25,6 @@ class TelemetryRegister: + this class is going to be a singleton so that we can register functions to it + from anywhere and have them persist across all telemetry calls. + +- This doesn't actually work - we are going to need one registry per service, so +- we can't use a singleton here. We'll need to refactor this later. + """ + + # Set of registered functions for now - we can add more later or dynamically +diff --git a/src/access_py_telemetry/utils.py b/src/access_py_telemetry/utils.py +new file mode 100644 +index 0000000..7ed1986 +--- /dev/null ++++ b/src/access_py_telemetry/utils.py +@@ -0,0 +1,54 @@ ++""" ++Copyright 2022 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. ++SPDX-License-Identifier: Apache-2.0 ++""" ++ ++from typing import Any ++import yaml ++from pathlib import Path ++from dataclasses import dataclass, field ++ ++ ++with open(Path(__file__).parent / "config.yaml", "r") as f: ++ config = yaml.safe_load(f) ++ ++ ++@dataclass ++class TelemetryRegister: ++ endpoint: str ++ items: set[str] = field(default_factory=set) ++ ++ ++def build_endpoints( ++ config: dict[str, Any], parent: str | None = None ++) -> list[TelemetryRegister]: ++ """ ++ Recursively join the keys of the dictionary until we reach a list ++ Returns list of tuples (path, endpoint_list) ++ """ ++ parent = parent or "" ++ results = [] ++ ++ for key, val in config.items(): ++ if isinstance(val, dict): ++ # Recursively process dictionaries and extend results ++ nested_results = build_endpoints(val, f"{parent}/{key}" if parent else key) ++ results.extend(nested_results) ++ elif isinstance(val, list): ++ # Add tuple of (path, list) when we find a list ++ full_path = "/".join([parent, key]) if parent else key ++ results.append(TelemetryRegister(full_path, set(val))) ++ ++ return results ++ ++ ++ENDPOINTS = { ++ register.endpoint.replace("/", "_"): register.endpoint ++ for register in build_endpoints(config) ++} ++ ++REGISTRIES = { ++ register.endpoint.replace("/", "_"): register.items ++ for register in build_endpoints(config) ++} ++SERVER_URL = "https://tracking-services-d6c2fd311c12.herokuapp.com" +diff --git a/tests/test_api.py b/tests/test_api.py +index 2cd9f8c..c771206 100644 +--- a/tests/test_api.py ++++ b/tests/test_api.py +@@ -74,15 +74,20 @@ def test_api_handler_extra_fields(local_host, api_handler): + with pytest.raises(AttributeError): + session1.extra_fields = {"catalog_version": "1.0"} + +- session1.add_extra_fields("catalog", {"version": "1.0"}) ++ XF_NAME = "intake_catalog" + +- blank_registries = {key: {} for key in session1.registries if key != "catalog"} ++ session1.add_extra_fields(XF_NAME, {"version": "1.0"}) + +- assert session2.extra_fields == {"catalog": {"version": "1.0"}, **blank_registries} ++ blank_registries = {key: {} for key in session1.registries if key != XF_NAME} ++ ++ assert session2.extra_fields == { ++ "intake_catalog": {"version": "1.0"}, ++ **blank_registries, ++ } + + with pytest.raises(KeyError) as excinfo: +- session1.add_extra_fields("catalogue", {"version": "2.0"}) +- assert str(excinfo.value) == "Endpoint catalogue not found" ++ session1.add_extra_fields("catalog", {"version": "2.0"}) ++ assert str(excinfo.value) == "Endpoint catalog not found" + + # Make sure that adding a new sesson doesn't overwrite the old one + session3 = ApiHandler() +@@ -221,7 +226,7 @@ def test_api_handler_invalid_endpoint(api_handler): + # of the _extra_fields attribute + + api_handler.endpoints = { +- "catalog": "/intake/update", ++ "intake_catalog": "/intake/catalog", + } + + api_handler._extra_fields = { +diff --git a/tests/test_decorators.py b/tests/test_decorators.py +index 3021a25..e20470a 100644 +--- a/tests/test_decorators.py ++++ b/tests/test_decorators.py +@@ -13,7 +13,7 @@ def test_ipy_register_func(api_handler, reset_telemetry_register): + """ + + @ipy_register_func( +- service="catalog", ++ service="intake_catalog", + extra_fields={"model": "ACCESS-OM2", "random_number": 2}, + pop_fields=["session_id"], + ) +@@ -22,16 +22,18 @@ def test_ipy_register_func(api_handler, reset_telemetry_register): + + my_func() + +- register = TelemetryRegister("catalog") ++ register = TelemetryRegister("intake_catalog") + api_handler = ApiHandler() +- blank_registries = {key: {} for key in api_handler.registries if key != "catalog"} ++ blank_registries = { ++ key: {} for key in api_handler.registries if key != "intake_catalog" ++ } + + assert api_handler.extra_fields == { +- "catalog": {"model": "ACCESS-OM2", "random_number": 2}, ++ "intake_catalog": {"model": "ACCESS-OM2", "random_number": 2}, + **blank_registries, + } + +- assert api_handler.pop_fields == {"catalog": ["session_id"]} ++ assert api_handler.pop_fields == {"intake_catalog": ["session_id"]} + + assert my_func.__name__ in register + +@@ -47,24 +49,26 @@ async def test_register_func(api_handler, reset_telemetry_register): + """ + + @register_func( +- service="catalog", ++ service="intake_catalog", + extra_fields={"model": "ACCESS-OM2", "random_number": 2}, + pop_fields=["session_id"], + ) + def my_func(): + pass + +- register = TelemetryRegister("catalog") ++ register = TelemetryRegister("intake_catalog") + api_handler = ApiHandler() + +- blank_registries = {key: {} for key in api_handler.registries if key != "catalog"} ++ blank_registries = { ++ key: {} for key in api_handler.registries if key != "intake_catalog" ++ } + + assert api_handler.extra_fields == { +- "catalog": {"model": "ACCESS-OM2", "random_number": 2}, ++ "intake_catalog": {"model": "ACCESS-OM2", "random_number": 2}, + **blank_registries, + } + +- assert api_handler.pop_fields == {"catalog": ["session_id"]} ++ assert api_handler.pop_fields == {"intake_catalog": ["session_id"]} + + assert my_func.__name__ in register + +diff --git a/tests/test_registry.py b/tests/test_registry.py +index e37b0cb..42b469a 100644 +--- a/tests/test_registry.py ++++ b/tests/test_registry.py +@@ -14,8 +14,8 @@ def test_telemetry_register_unique(reset_telemetry_register): + and deregister functions as we would expect. + """ + TelemetryRegister._instances = {} +- session1 = TelemetryRegister("catalog") +- session2 = TelemetryRegister("catalog") ++ session1 = TelemetryRegister("intake_catalog") ++ session2 = TelemetryRegister("intake_catalog") + + # assert session1 is session2 + +@@ -36,7 +36,7 @@ def test_telemetry_register_unique(reset_telemetry_register): + + session1.deregister("test_function", "DfFileCatalog.__getitem__") + +- session3 = TelemetryRegister("catalog") ++ session3 = TelemetryRegister("intake_catalog") + + assert set(session3) == { + "esm_datastore.search", +@@ -50,7 +50,7 @@ def test_telemetry_register_unique(reset_telemetry_register): + + + def test_telemetry_register_validation(reset_telemetry_register): +- session_register = TelemetryRegister("catalog") ++ session_register = TelemetryRegister("intake_catalog") + + with pytest.raises(ValidationError): + session_register.register(1.0)