From 4c8144056f3c05416c137cd46fd70485e4015332 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Wed, 8 Jan 2025 15:05:07 +0100 Subject: [PATCH 01/44] Add files for Foundry client and server --- .github/workflows/ci.yaml | 14 +- Dockerfile | 39 +++++ Makefile | 10 +- _docker_requirements.txt | 137 ++++++++++++++++ aurora/foundry/__init__.py | 12 ++ aurora/foundry/client/__init__.py | 1 + aurora/foundry/client/api.py | 125 +++++++++++++++ aurora/foundry/client/foundry.py | 61 +++++++ aurora/foundry/common/__init__.py | 1 + aurora/foundry/common/channel.py | 254 ++++++++++++++++++++++++++++++ aurora/foundry/common/model.py | 93 +++++++++++ aurora/foundry/server/__init__.py | 3 + aurora/foundry/server/score.py | 144 +++++++++++++++++ docs/_toc.yml | 32 ++-- docs/example_era5.ipynb | 2 +- docs/example_hres_0.1.ipynb | 2 +- docs/example_hres_t0.ipynb | 2 +- docs/foundry/api.rst | 29 ++++ docs/foundry/intro.md | 6 + docs/foundry/server.md | 17 ++ docs/foundry/submission.md | 66 ++++++++ pyproject.toml | 15 +- tests/foundry/__init__.py | 12 ++ tests/foundry/azcopy.py | 74 +++++++++ tests/foundry/conftest.py | 152 ++++++++++++++++++ tests/foundry/runner.py | 66 ++++++++ tests/foundry/test_api.py | 33 ++++ 27 files changed, 1379 insertions(+), 23 deletions(-) create mode 100644 Dockerfile create mode 100644 _docker_requirements.txt create mode 100644 aurora/foundry/__init__.py create mode 100644 aurora/foundry/client/__init__.py create mode 100644 aurora/foundry/client/api.py create mode 100644 aurora/foundry/client/foundry.py create mode 100644 aurora/foundry/common/__init__.py create mode 100644 aurora/foundry/common/channel.py create mode 100644 aurora/foundry/common/model.py create mode 100644 aurora/foundry/server/__init__.py create mode 100644 aurora/foundry/server/score.py create mode 100644 docs/foundry/api.rst create mode 100644 docs/foundry/intro.md create mode 100644 docs/foundry/server.md create mode 100644 docs/foundry/submission.md create mode 100644 
tests/foundry/__init__.py create mode 100644 tests/foundry/azcopy.py create mode 100644 tests/foundry/conftest.py create mode 100644 tests/foundry/runner.py create mode 100644 tests/foundry/test_api.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 097fb18..8385f71 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -17,14 +17,24 @@ jobs: name: Test with Python ${{ matrix.version }} steps: - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.version }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build Foundry image + run: | + make docker + - name: Install dependencies run: | - python -m pip install --upgrade pip - python -m pip install --upgrade --no-cache-dir -e '.[dev]' + python -m pip install --upgrade pip + python -m pip install --upgrade --no-cache-dir -e '.[dev]' + - name: Run tests run: | pytest -v --cov=aurora --cov-report term-missing diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c7e4725 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +# Use the AzureML OpenMPI (Ubuntu 20.04) base image as the parent image. +FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest + +WORKDIR /aurora_foundry +COPY ./pyproject.toml . + +# Assuming dependencies are fairly fixed, we can install them first and then copy the rest of the +# code to avoid re-installing dependencies when the code changes. +COPY _docker_requirements.txt requirements.txt +RUN pip install --upgrade pip virtualenv && \ + virtualenv venv -p python3.10 && \ + . venv/bin/activate && \ + pip install -r requirements.txt + +# Download model weights.
+RUN ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-small-pretrained.ckpt")' && \ + ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-finetuned.ckpt")' + +COPY ./aurora ./aurora +COPY ./LICENSE.txt . +COPY ./README.md . + +ARG AURORA_REPO_VERSION +RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set."; exit 1; } && \ + . venv/bin/activate && \ + SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e . + + +# Install `azcopy` and the AML inference server. +RUN wget https://aka.ms/downloadazcopy-v10-linux -O azcopy.tar.gz && \ + cp $(tar -xvzf azcopy.tar.gz | grep azcopy$) /usr/local/bin/azcopy +RUN . ./venv/bin/activate && \ + pip install azureml-inference-server-http + +# Make port 5001 available to the world outside this container. +EXPOSE 5001 +ENV PORT=5001 + +CMD ["./venv/bin/azmlinfsrv", "--entry_script", "aurora/foundry/server/score.py"] diff --git a/Makefile b/Makefile index c423c55..9aecbce 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: install test docs +.PHONY: install test docs docker-requirements docker install: pip install --upgrade pip @@ -11,3 +11,11 @@ test: docs: jupyter-book build docs cp -r docs/_extras/* docs/_build/html/ + +docker-requirements: pyproject.toml + (pip show pip-tools 1>/dev/null) || pip install pip-tools + pip-compile --verbose --output-file _docker_requirements.txt pyproject.toml + +docker: + (pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm + AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t aurora-foundry:latest . 
diff --git a/_docker_requirements.txt b/_docker_requirements.txt new file mode 100644 index 0000000..c5151cd --- /dev/null +++ b/_docker_requirements.txt @@ -0,0 +1,137 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --output-file=requirements.txt pyproject.toml +# +annotated-types==0.7.0 + # via pydantic +certifi==2024.12.14 + # via + # netcdf4 + # requests +cftime==1.6.4.post1 + # via netcdf4 +charset-normalizer==3.4.1 + # via requests +einops==0.8.0 + # via microsoft-aurora (pyproject.toml) +filelock==3.16.1 + # via + # huggingface-hub + # torch + # triton +fsspec==2024.12.0 + # via + # huggingface-hub + # torch +huggingface-hub==0.27.1 + # via + # microsoft-aurora (pyproject.toml) + # timm +idna==3.10 + # via requests +jinja2==3.1.5 + # via torch +markupsafe==3.0.2 + # via jinja2 +mpmath==1.3.0 + # via sympy +netcdf4==1.7.2 + # via microsoft-aurora (pyproject.toml) +networkx==3.4.2 + # via torch +numpy==2.2.1 + # via + # cftime + # microsoft-aurora (pyproject.toml) + # netcdf4 + # pandas + # scipy + # torchvision + # xarray +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +packaging==24.2 + # via + # huggingface-hub + # xarray +pandas==2.2.3 + # via xarray +pillow==11.1.0 + # via torchvision +pydantic==2.10.4 + # via microsoft-aurora (pyproject.toml) +pydantic-core==2.27.2 + # 
via pydantic +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # huggingface-hub + # timm +requests==2.32.3 + # via huggingface-hub +scipy==1.15.0 + # via microsoft-aurora (pyproject.toml) +six==1.17.0 + # via python-dateutil +sympy==1.13.1 + # via torch +timm==0.6.13 + # via microsoft-aurora (pyproject.toml) +torch==2.5.1 + # via + # microsoft-aurora (pyproject.toml) + # timm + # torchvision +torchvision==0.20.1 + # via timm +tqdm==4.67.1 + # via huggingface-hub +triton==3.1.0 + # via torch +typing-extensions==4.12.2 + # via + # huggingface-hub + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==2.3.0 + # via requests +xarray==2025.1.0 + # via microsoft-aurora (pyproject.toml) diff --git a/aurora/foundry/__init__.py b/aurora/foundry/__init__.py new file mode 100644 index 0000000..5f5f373 --- /dev/null +++ b/aurora/foundry/__init__.py @@ -0,0 +1,12 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +from aurora.foundry.client.api import SubmissionError, submit +from aurora.foundry.client.foundry import FoundryClient +from aurora.foundry.common.channel import BlobStorageCommunication + +__all__ = [ + "BlobStorageCommunication", + "FoundryClient", + "submit", + "SubmissionError", +] diff --git a/aurora/foundry/client/__init__.py b/aurora/foundry/client/__init__.py new file mode 100644 index 0000000..a679a52 --- /dev/null +++ b/aurora/foundry/client/__init__.py @@ -0,0 +1 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py new file mode 100644 index 0000000..2159889 --- /dev/null +++ b/aurora/foundry/client/api.py @@ -0,0 +1,125 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license. + +This is the API that the end user uses to submit jobs to the model running on Azure AI Foundry. 
+""" + +import logging +from typing import Generator, Literal, Optional, Union + +from pydantic import BaseModel, Field + +from aurora import Batch +from aurora.foundry.client.foundry import AbstractFoundryClient +from aurora.foundry.common.channel import CommunicationChannel, iterate_prediction_files +from aurora.foundry.common.model import models + +__all__ = ["SubmissionError", "submit"] + +logger = logging.getLogger(__name__) + + +class SubmissionInfo(BaseModel): + kind: Literal["submission_info"] + uuid: str + + +class ProgressInfo(BaseModel): + kind: Literal["progress_info"] + uuid: str + completed: bool + progress_percentage: int + error: bool + error_info: str + + +class Answer(BaseModel): + success: bool + message: str + data: Optional[Union[SubmissionInfo, ProgressInfo]] = Field(None, discriminator="kind") + + +class SubmissionError(Exception): + """The submission could not be completed for some reason.""" + + +def submit( + batch: Batch, + model_name: str, + num_steps: int, + client_comm: CommunicationChannel, + host_comm: CommunicationChannel, + foundry_client: AbstractFoundryClient, +) -> Generator[Batch, None, None]: + """Submit a request to Azure AI Foundry and retrieve the predictions. + + Args: + batch (:class:`aurora.Batch`): Initial condition. + model_name (str): Name of the model. This name must be available in + :mod:`aurora.foundry.common.model`. + num_steps (int): Number of prediction steps. + client_comm (:class:`aurora.foundry.common.channel.CommunicationChannel`): Channel that the + client uses to send and receive data. + host_comm (:class:`aurora.foundry.common.channel.CommunicationChannel`): Channel that the + host uses to send and receive data. + foundry_client (:class:`aurora.foundry.client.foundry.AbstractFoundryClient`): Client to + communicate with Azure AI Foundry. + + Yields: + :class:`aurora.Batch`: Predictions.
+ """ + if model_name not in models: + raise KeyError(f"Model `{model_name}` is not a valid model.") + + # Send a request to the endpoint to produce the predictions. + data = { + "request": { + "action": "submit", + "model_name": model_name, + "num_steps": num_steps, + "host_comm": host_comm.to_spec(), + } + } + answer = Answer(**foundry_client.score(data)) + if not answer.success: + raise SubmissionError(answer.message) + submission_info = answer.data + if not isinstance(submission_info, SubmissionInfo): + raise SubmissionError( + "Server returned no submission information. " + "Cannot determine task UUID to track tasks." + ) + task_uuid = submission_info.uuid + logger.info("Submitted request to endpoint.") + + # Send the initial condition over. + client_comm.send(batch, task_uuid, "input.nc") + + previous_progress: int = 0 + + while True: + # Check on the progress of the task. + data = {"request": {"action": "check", "uuid": task_uuid}} + answer = Answer(**foundry_client.score(data)) + if not answer.success: + raise SubmissionError(answer.message) + progress_info = answer.data + if not isinstance(progress_info, ProgressInfo): + raise SubmissionError( + "Server returned no progress information. " + "Cannot determine whether the task has been completed or not." 
+ ) + + if progress_info.error: + raise SubmissionError(f"Task failed: {progress_info.error_info}") + + if progress_info.progress_percentage > previous_progress: + logger.info(f"Task progress update: {progress_info.progress_percentage}%.") + previous_progress = progress_info.progress_percentage + + if progress_info.completed: + logger.info("Task has been completed!") + break + + logger.info("Retrieving predictions.") + for prediction_name in iterate_prediction_files("prediction.nc", num_steps): + yield client_comm.receive(task_uuid, prediction_name) diff --git a/aurora/foundry/client/foundry.py b/aurora/foundry/client/foundry.py new file mode 100644 index 0000000..09e21e8 --- /dev/null +++ b/aurora/foundry/client/foundry.py @@ -0,0 +1,61 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +import abc +import logging +import os +from typing import Literal + +import requests + +__all__ = ["AbstractFoundryClient", "FoundryClient"] + + +logger = logging.getLogger(__name__) + + +class AbstractFoundryClient(metaclass=abc.ABCMeta): + """A client to talk to Azure AI Foundry.""" + + @abc.abstractmethod + def score(self, data: dict) -> dict: + """Send `data` to the scoring path. + + Args: + data (dict): Data to send. + + Returns: + dict: Answer. + """ + + +class FoundryClient(AbstractFoundryClient): + def __init__(self, endpoint: str, token: str) -> None: + """Initialise. + + Args: + endpoint (str): URL to the endpoint. + token (str): Authorisation token. 
+ """ + self.endpoint = endpoint + self.token = token + + def _req( + self, + method: Literal["POST", "GET"], + path: str, + data: dict | None = None, + ) -> requests.Response: + return requests.request( + method, + os.path.join(self.endpoint, path), + headers={ + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json", + }, + json=data, + ) + + def score(self, data: dict) -> dict: + answer = self._req("POST", "score", {"data": data}) + answer.raise_for_status() + return answer.json() diff --git a/aurora/foundry/common/__init__.py b/aurora/foundry/common/__init__.py new file mode 100644 index 0000000..a679a52 --- /dev/null +++ b/aurora/foundry/common/__init__.py @@ -0,0 +1 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" diff --git a/aurora/foundry/common/channel.py b/aurora/foundry/common/channel.py new file mode 100644 index 0000000..9b394a9 --- /dev/null +++ b/aurora/foundry/common/channel.py @@ -0,0 +1,254 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +import abc +import json +import logging +import os +import subprocess +import tempfile +import time +from pathlib import Path +from typing import Generator, Literal + +from pydantic import BaseModel, HttpUrl + +from aurora import Batch + +__all__ = [ + "CommunicationChannel", + "LocalCommunication", + "BlobStorageCommunication", + "iterate_prediction_files", +] + +logger = logging.getLogger(__name__) + + +class CommunicationChannel(metaclass=abc.ABCMeta): + """A communication channel for sending very large files.""" + + def send(self, batch: Batch, uuid: str, name: str) -> None: + """Send `batch` under the file name `name` and mark the file as done. + + Args: + batch (:class:`aurora.Batch`): Batch to send. + uuid (str): UUID of the task. + name (str): Name of `batch`. + """ + name = f"{uuid}/{name}" + self._send(batch, name) + self._mark(name) + + def receive(self, uuid: str, name: str) -> Batch: + """Receive the batch under the file name `name`.
+ + This function blocks until the file is ready. + + Args: + uuid (str): UUID of the task. + name (str): Name to receive. + + Returns: + :class:`aurora.Batch`: Batch under the name `name`. + """ + name = f"{uuid}/{name}" + while not self._is_marked(name): + time.sleep(0.5) + return self._receive(name) + + @abc.abstractmethod + def _send(self, batch: Batch, name: str) -> None: + """Send `batch` under the file name `name` without marking the file. + + This method should be implemented. + + Args: + batch (:class:`aurora.Batch`): Batch to send. + name (str): Name of `batch`. + """ + + @abc.abstractmethod + def _receive(self, name: str) -> Batch: + """Receive the batch under the file name `name`. + + This function asserts that the file is ready and should be implemented by implementations. + + Args: + name (str): Name to receive. + + Returns: + :class:`aurora.Batch`: Batch under the file name `name`. + """ + + @abc.abstractmethod + def _mark(self, name: str) -> None: + """Mark the file `name` as done. + + Args: + name (str): File to mark. + """ + + @abc.abstractmethod + def _is_marked(self, name: str) -> bool: + """Check whether the file `name` has been marked. + + Args: + name (str): File to check. + + Returns: + bool: Whether `name` has been marked or not. + """ + + @abc.abstractmethod + def to_spec(self) -> dict[str, str]: + """Convert this channel to specification that can be serialised into JSON. + + Returns: + dict[str, str]: Specification. + """ + + +class LocalCommunication(CommunicationChannel): + """A communication channel via a local folder.""" + + def __init__(self, folder: str | Path) -> None: + """Instantiate. + + Args: + folder (str or Path): Folder to use. 
+ """ + self.folder = Path(folder) + + def to_spec(self) -> dict[str, str]: + return { + "class_name": "LocalCommunication", + "folder": str(self.folder), + } + + class Spec(BaseModel): + class_name: Literal["LocalCommunication"] + folder: Path + + def construct(self) -> "LocalCommunication": + return LocalCommunication(folder=str(self.folder)) + + def _send(self, batch: Batch, name: str) -> None: + target = self.folder / name + target.parent.mkdir(exist_ok=True, parents=True) + batch.to_netcdf(target) + + def _receive(self, name: str) -> Batch: + return Batch.from_netcdf(self.folder / name) + + def _mark(self, name: str) -> None: + target = self.folder / f"{name}.finished" + target.parent.mkdir(exist_ok=True, parents=True) + target.touch() + + def _is_marked(self, name: str) -> bool: + return (self.folder / f"{name}.finished").exists() + + +class BlobStorageCommunication(CommunicationChannel): + """A communication channel via a folder in a Azure Blob Storage container.""" + + _AZCOPY_EXECUTABLE: list[str] = ["azcopy"] + + def __init__(self, blob_folder: str) -> None: + """Instantiate. + + Args: + blob_folder (str): Folder to use. This must be a full URL that includes a SAS token + with read and write permissions. + """ + self.blob_folder = blob_folder + if "?" not in blob_folder: + raise ValueError("Given URL does not appear to contain a SAS token.") + + def to_spec(self) -> dict[str, str]: + return { + "class_name": "BlobStorageCommunication", + "blob_folder": self.blob_folder, + } + + class Spec(BaseModel): + class_name: Literal["BlobStorageCommunication"] + blob_folder: HttpUrl # TODO: Can we force this to be `https`? + + def construct(self) -> "BlobStorageCommunication": + return BlobStorageCommunication(blob_folder=str(self.blob_folder)) + + def _blob_path(self, name: str) -> str: + """For a given file name `name`, get the full path including the SAS token. + + Args: + name (str): File name. + + Returns: + str: Full path including the SAS token. 
+ """ + url, sas = self.blob_folder.split("?", 1) + # Don't use `pathlib.Path` for web URLs! It messes up the protocol. + return f"{os.path.join(url, name)}?{sas}" + + def _azcopy(self, command: list[str]) -> str: + result = subprocess.run( + self._AZCOPY_EXECUTABLE + command, + stdout=subprocess.PIPE, + check=True, + ) + return result.stdout.decode("utf-8") + + def _send(self, batch: Batch, name: str) -> None: + with tempfile.NamedTemporaryFile() as tf: + batch.to_netcdf(tf.name) + self._azcopy(["copy", tf.name, self._blob_path(name)]) + + def _receive(self, name: str) -> Batch: + with tempfile.NamedTemporaryFile() as tf: + self._azcopy(["copy", self._blob_path(name), tf.name]) + return Batch.from_netcdf(tf.name) + + def _mark(self, name: str) -> None: + with tempfile.TemporaryDirectory() as td: + mark_file_path = Path(td) / f"{name}.finished" + mark_file_path.parent.mkdir(exist_ok=True, parents=True) + mark_file_path.touch() + self._azcopy(["copy", str(mark_file_path), self._blob_path(f"{name}.finished")]) + + def _is_marked(self, name: str) -> bool: + out = json.loads( + self._azcopy( + [ + "list", + self._blob_path(f"{name}.finished"), + "--output-type", + "json", + "--output-level", + "essential", + ] + ) + ) + return ( + len(out) == 2 + and isinstance(out[0], dict) + and out[0].get("MessageType", None) == "ListObject" + and isinstance(out[1], dict) + and out[1].get("MessageType", None) == "EndOfJob" + ) + + +def iterate_prediction_files(name: str, num_steps: int) -> Generator[str, None, None]: + """For a file name `name`, generate `num_steps` variations of `name`: one for every prediction + step. + + Args: + name (str): Base file name to derive the file names of the predictions from. + num_steps (int): Number of prediction steps. + + Yields: + str: For every prediction, the derived file name. 
+ """ + name, ext = os.path.splitext(name) + for i in range(num_steps): + yield f"{name}-{i:03d}{ext}" diff --git a/aurora/foundry/common/model.py b/aurora/foundry/common/model.py new file mode 100644 index 0000000..f978725 --- /dev/null +++ b/aurora/foundry/common/model.py @@ -0,0 +1,93 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +import abc +import logging +from typing import Generator + +import torch + +import aurora +from aurora import Batch, rollout + +__all__ = ["Model", "models"] + +logger = logging.getLogger(__name__) + +if torch.cuda.is_available(): + logger.info("GPU detected. Running on GPU.") + target_device = torch.device("cuda") +else: + logger.warning("No GPU available. Running on CPU.") + target_device = torch.device("cpu") + + +class Model(metaclass=abc.ABCMeta): + """A model that can run predictions.""" + + def __init__(self) -> None: + """Initialise. + + This creates and loads the model. + """ + self.model = self.create_model() + self.model.eval() + + @abc.abstractmethod + def create_model(self) -> aurora.Aurora: + """Create the model. + + Returns: + :class:`aurora.Aurora`: Model. + """ + + @torch.inference_mode + def run(self, batch: Batch, num_steps: int) -> Generator[Batch, None, None]: + """Perform predictions on the target device. + + Args: + batch (:class:`aurora.Batch`): Initial condition. + num_steps (int): Number of prediction steps. + + Yields: + :class:`aurora.Batch`: Prediction for every step, moved to the CPU. + """ + # Move batch and model to target device. + self.model.to(target_device) # Modifies in-place! + batch = batch.to(target_device) + + # Perform predictions, immediately moving the output to the CPU. + for pred in rollout(self.model, batch, steps=num_steps): + yield pred.to("cpu") + + # Move batch and model back to the CPU. + batch = batch.to("cpu") + self.model.cpu() # Modifies in-place!
+ + +class AuroraSmall(Model): + name = "aurora-0.25-small-pretrained" + """str: Name of the model.""" + + def create_model(self) -> aurora.Aurora: + model = aurora.AuroraSmall() + model.load_checkpoint("microsoft/aurora", "aurora-0.25-small-pretrained.ckpt") + return model + + +class AuroraFineTuned(Model): + name = "aurora-0.25-finetuned" + """str: Name of the model.""" + + def create_model(self) -> aurora.Aurora: + model = aurora.Aurora() + model.load_checkpoint("microsoft/aurora", "aurora-0.25-finetuned.ckpt") + return model + + +models: dict[str, type[Model]] = {} +"""dict[str, type[Model]]: A dictionary that lists all available models by their name.""" + +for model_class in Model.__subclasses__(): + assert hasattr(model_class, "name"), f"`{model_class.__name__}` is missing `name`." + # `mypy` will complain, because `Model` is abstract, so it cannot be passed to `type`. + models[model_class.name] = model_class # type: ignore[type-abstract] diff --git a/aurora/foundry/server/__init__.py b/aurora/foundry/server/__init__.py new file mode 100644 index 0000000..a44d891 --- /dev/null +++ b/aurora/foundry/server/__init__.py @@ -0,0 +1,3 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +from .score import * # noqa: F403 diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py new file mode 100644 index 0000000..dddc701 --- /dev/null +++ b/aurora/foundry/server/score.py @@ -0,0 +1,144 @@ +"""Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" + +import json +import logging +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Literal, Union +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from aurora.foundry.common.channel import ( + BlobStorageCommunication, + LocalCommunication, + iterate_prediction_files, +) +from aurora.foundry.common.model import models + +__all__ = ["init", "run"] + +# Need to give the name explicitly here, because the script may be run stand-alone. +logger = logging.getLogger("aurora.foundry.server.score") + +CommSpecs = Union[LocalCommunication.Spec, BlobStorageCommunication.Spec] + + +class Submission(BaseModel): + action: Literal["submit"] + host_comm: CommSpecs = Field(..., discriminator="class_name") + model_name: str + num_steps: int + + +class Check(BaseModel): + action: Literal["check"] + uuid: str + + +class Request(BaseModel): + request: Union[Submission, Check] = Field(..., discriminator="action") + + +POOL = ThreadPoolExecutor(max_workers=1) +TASKS = dict() + + +class Task: + def __init__(self, request: Submission): + self.request: Submission = request + # TODO: Make sure that this `uuid` really is unique! + self.uuid: str = str(uuid4()) + self.progress_percentage: int = 0 + self.completed: bool = False + self.exc: Exception | None = None + + def __call__(self) -> None: + try: + request = self.request + host_comm = request.host_comm.construct() + + model_class = models[request.model_name] + model = model_class() + + batch = host_comm.receive(self.uuid, "input.nc") + + logger.info("Running predictions.") + for i, (pred, path) in enumerate( + zip( + model.run(batch, request.num_steps), + iterate_prediction_files("prediction.nc", request.num_steps), + ) + ): + host_comm.send(pred, self.uuid, path) + self.progress_percentage = int((100 * (i + 1)) / request.num_steps) + + self.completed = True + + except Exception as exc: + self.exc = exc + + +def init() -> None: + """Initialise. 
Do not load the model here, because which model we need depends on the + request.""" + POOL.__enter__() + + +def run(raw_data: str) -> dict: + """Perform predictions. + + Args: + raw_data (str): Request as JSON. + + Returns: + dict: Answer, which will be encoded as JSON. + """ + logger.info("Received request.") + raw = json.loads(raw_data) + request = Request(**raw["data"]).request + + if isinstance(request, Submission): + logger.info("Submitting new task to thread pool.") + task = Task(request) + POOL.submit(task) + TASKS[task.uuid] = task + return { + "success": True, + "message": "Request has been succesfully submitted.", + "data": { + "kind": "submission_info", + "uuid": task.uuid, + }, + } + + elif isinstance(request, Check): + logger.info("Returning the status of an existing task.") + uuid = request.uuid + time.sleep(1) # Sleep here, so the client does not need to. + if uuid not in TASKS: + return { + "success": False, + "message": "Task UUID cannot be found.", + } + else: + task = TASKS[uuid] + return { + "success": True, + "message": "Status check completed.", + "data": { + "kind": "progress_info", + "uuid": uuid, + "completed": task.completed, + "progress_percentage": task.progress_percentage, + "error": task.exc is not None, + "error_info": str(task.exc) if task.exc else "", + }, + } + + else: + # This branch should be unreachable. 
+ return { + "success": False, + "message": "Invalid action.", + } diff --git a/docs/_toc.yml b/docs/_toc.yml index 347b057..b22099d 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -1,13 +1,23 @@ format: jb-book root: intro -chapters: -- file: usage -- file: batch -- file: models -- file: beware -- file: finetuning -- file: example_era5 -- file: example_hres_t0 -- file: example_hres_0.1 -- file: ai_models_plugin -- file: api +parts: +- caption: Main Usage + chapters: + - file: usage + - file: batch + - file: models + - file: beware + - file: finetuning + - file: ai_models_plugin + - file: api +- caption: Examples + chapters: + - file: example_era5 + - file: example_hres_t0 + - file: example_hres_0.1 +- caption: Foundry + chapters: + - file: foundry/intro + - file: foundry/submission + - file: foundry/server + - file: foundry/api diff --git a/docs/example_era5.ipynb b/docs/example_era5.ipynb index 9ba76c2..85c8f29 100644 --- a/docs/example_era5.ipynb +++ b/docs/example_era5.ipynb @@ -5,7 +5,7 @@ "id": "b2f57a10-13a1-4f66-a734-065fc16b17b2", "metadata": {}, "source": [ - "# Example: Predictions for ERA5\n", + "# Predictions for ERA5\n", "\n", "In this example, we will download ERA5 data for 1 Jan 2023 at 0.25 degrees resolution and run Aurora on this data. The fine-tuned version of Aurora specifically only works with IFS HRES T0, so we use the non-fine-tuned version of Aurora in this example.\n", "\n", diff --git a/docs/example_hres_0.1.ipynb b/docs/example_hres_0.1.ipynb index ac9da5a..8ef076e 100644 --- a/docs/example_hres_0.1.ipynb +++ b/docs/example_hres_0.1.ipynb @@ -5,7 +5,7 @@ "id": "b2f57a10-13a1-4f66-a734-065fc16b17b2", "metadata": {}, "source": [ - "# Example: Predictions for HRES at 0.1 degrees\n", + "# Predictions for HRES at 0.1 degrees\n", "\n", "In this example, we will download IFS HRES analysis data at 0.1 degrees resolution for 11 May 2022 from the [Research Data Archive](https://rda.ucar.edu/datasets/d113001/#) and run Aurora on this data. 
We will use the version of Aurora that was fine-tuned on IFS HRES analysis at 0.1 degrees resolution.\n", "\n", diff --git a/docs/example_hres_t0.ipynb b/docs/example_hres_t0.ipynb index 314b8c6..d765158 100644 --- a/docs/example_hres_t0.ipynb +++ b/docs/example_hres_t0.ipynb @@ -5,7 +5,7 @@ "id": "b2f57a10-13a1-4f66-a734-065fc16b17b2", "metadata": {}, "source": [ - "# Example: Predictions for HRES T0\n", + "# Predictions for HRES T0\n", "\n", "In this example, we will download HRES T0 data for 11 May 2022 from [WeatherBench2](https://weatherbench2.readthedocs.io/) at 0.25 degrees resolution and run Aurora on this data. We will use the version of Aurora that was fine-tuned on IFS HRES T0 in this example.\n", "\n", diff --git a/docs/foundry/api.rst b/docs/foundry/api.rst new file mode 100644 index 0000000..1138439 --- /dev/null +++ b/docs/foundry/api.rst @@ -0,0 +1,29 @@ +Application Programming Interface +================================= + +Submission +---------- +.. autofunction:: aurora.foundry.client.api.submit + +.. autoclass:: aurora.foundry.client.foundry.FoundryClient + :members: __init__ + +.. autoclass:: aurora.foundry.common.channel.BlobStorageCommunication + :members: __init__ + + +Available Models +---------------- +These models need to be referred to by the value of their attribute `name`. + +.. autoclass:: aurora.foundry.common.model.AuroraFineTuned + :members: name + +.. autoclass:: aurora.foundry.common.model.AuroraSmall + :members: name + +Server +------ +.. autofunction:: aurora.foundry.server.score.init + +.. autofunction:: aurora.foundry.server.score.run diff --git a/docs/foundry/intro.md b/docs/foundry/intro.md new file mode 100644 index 0000000..f5076ce --- /dev/null +++ b/docs/foundry/intro.md @@ -0,0 +1,6 @@ +# Aurora on Azure AI Foundry + +Aurora can be run as a model on [Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio). 
+ +This part of the documentation describes how you can produce predictions with Aurora running on Foundry, +and how you can launch a server that hosts Aurora. diff --git a/docs/foundry/server.md b/docs/foundry/server.md new file mode 100644 index 0000000..ed7a4e8 --- /dev/null +++ b/docs/foundry/server.md @@ -0,0 +1,17 @@ +# Running the Inference Server + +Build the Docker image: + +```bash +make docker +``` + +The resulting image will be tagged as `aurora-foundry:latest`. +Upload this image to Azure AI Foundry. + +Building the Docker image depends on a list of precompiled dependencies. +If you change the requirements in `pyproject.toml`, this list must be updated: + +```bash +make docker-requirements +``` diff --git a/docs/foundry/submission.md b/docs/foundry/submission.md new file mode 100644 index 0000000..a697314 --- /dev/null +++ b/docs/foundry/submission.md @@ -0,0 +1,66 @@ +# Submitting Predictions + +To produce predictions on Azure AI Foundry, the client will communicate through +a blob storage container, so `azcopy` needs to be available in the local path. +[See here for instructions on how to install `azcopy`.](https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10) + +First, create a client that can communicate with your Azure AI Foundry endpoint: + +```python +from aurora.foundry import FoundryClient + +foundry_client = FoundryClient( + endpoint="https://endpoint/", + token="TOKEN", +) +``` + +Then set up a way to communicate with the model running on Foundry. +You likely want to send data back and forth via a folder in a blob storage container: + +```python +from aurora.foundry import BlobStorageCommunication + +communication = BlobStorageCommunication( + "https://my.blob.core.windows.net/container/folder?" +) +``` + +The SAS token needs read, write, and list rights. +This API does not automatically delete the model initial condition and predictions that are +uploaded to the blob storage folder.
+You will need to do that yourself. + +You can now submit requests in the following way: + +```python +from datetime import datetime + +import torch +from aurora import Batch, Metadata + +from aurora.foundry import submit + + +initial_condition = Batch( + surf_vars={k: torch.randn(1, 2, 17, 32) for k in ("2t", "10u", "10v", "msl")}, + static_vars={k: torch.randn(17, 32) for k in ("lsm", "z", "slt")}, + atmos_vars={k: torch.randn(1, 2, 4, 17, 32) for k in ("z", "u", "v", "t", "q")}, + metadata=Metadata( + lat=torch.linspace(90, -90, 17), + lon=torch.linspace(0, 360, 32 + 1)[:-1], + time=(datetime(2020, 6, 1, 12, 0),), + atmos_levels=(100, 250, 500, 850), + ), +) + +for pred in submit( + batch=initial_condition, + model_name="aurora-0.25-small-pretrained", + num_steps=4, + client_comm=communication, + host_comm=communication, + foundry_client=foundry_client, +): + pass # Do something with `pred`. +``` diff --git a/pyproject.toml b/pyproject.toml index 50c76e3..6a3c2bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,11 +18,13 @@ authors = [ license = {file = "LICENSE.txt"} readme = "README.md" keywords = [ - "aurora model", + "aurora model", "atmospheric dynamics", - "atmospheric chemistry", - "deep learning", - "foundation models", + "atmospheric chemistry", + "ocean waves", + "tropical cyclone tracking", + "weather prediction", + "foundation models", ] classifiers = [ "Programming Language :: Python :: 3", @@ -38,6 +40,9 @@ dependencies = [ "einops", "timm==0.6.13", "huggingface-hub", + "pydantic", + "xarray", + "netcdf4", ] [project.optional-dependencies] @@ -52,8 +57,6 @@ dev = [ "pre-commit", "jupyter-book", "scipy", - "xarray", - "netcdf4", ] [project.urls] diff --git a/tests/foundry/__init__.py b/tests/foundry/__init__.py new file mode 100644 index 0000000..3bc61b3 --- /dev/null +++ b/tests/foundry/__init__.py @@ -0,0 +1,12 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +import logging +import sys + +# Expose logging messages.
+logger = logging.getLogger() +logger.setLevel("INFO") +stream_handler = logging.StreamHandler(sys.stderr) +formatter = logging.Formatter("[%(levelname)s] %(asctime)s %(name)s: %(message)s") +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) diff --git a/tests/foundry/azcopy.py b/tests/foundry/azcopy.py new file mode 100644 index 0000000..1ea2368 --- /dev/null +++ b/tests/foundry/azcopy.py @@ -0,0 +1,74 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license. + +A mock of `azcopy` designed specifically for the tests here. +""" + +import json +import logging +import shutil +import sys +from pathlib import Path + +import click + +# Expose logging messages. +logger = logging.getLogger() +logger.setLevel("INFO") +stream_handler = logging.StreamHandler(sys.stderr) +formatter = logging.Formatter("[%(levelname)s] %(asctime)s %(name)s: %(message)s") +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + + +def _parse_path(path: str, work_path: Path) -> Path: + if path.startswith("https://"): + path, _ = path.split("?", 1) # Split off the SAS token. + _, path = path.split("blob.core.windows.net/", 1) # Split off the storage account URL. + return work_path / path + else: + # Just a local path. 
+ return Path(path) + + +@click.command(context_settings=dict(ignore_unknown_options=True)) +@click.argument( + "work_path", + required=True, + type=click.Path( + exists=False, file_okay=False, dir_okay=True, resolve_path=True, path_type=Path + ), +) +@click.argument("args", nargs=-1, type=click.UNPROCESSED) +def main(work_path: Path, args: tuple[str, ...]) -> None: + assert len(args) >= 1 + + logger.info(f'Faking `azcopy` call: `azcopy {" ".join(args)}`.') + + if args[0] in {"ls", "list"}: + assert len(args) >= 2 + + path = _parse_path(args[1], work_path) + + out: list[dict[str, str]] = [] + if path.exists(): + out.append({"MessageType": "ListObject"}) + for _ in path.rglob("*"): + out.append({"MessageType": "ListObject"}) + out.append({"MessageType": "EndOfJob"}) + + print(json.dumps(out)) + + elif args[0] in {"cp", "copy"}: + assert len(args) == 3 + + source = _parse_path(args[1], work_path) + target = _parse_path(args[2], work_path) + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(source, target) + + else: + raise RuntimeError(f"Unknown command `{args[0]}`.") + + +if __name__ == "__main__": + main() diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py new file mode 100644 index 0000000..a5c6f53 --- /dev/null +++ b/tests/foundry/conftest.py @@ -0,0 +1,152 @@ +"""Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" + +import json +import subprocess +import time +from pathlib import Path +from typing import Callable, Generator + +import pytest +import requests + +from aurora.foundry.client.foundry import AbstractFoundryClient +from aurora.foundry.common.channel import BlobStorageCommunication, LocalCommunication + + +class MockFoundryClient(AbstractFoundryClient): + def __init__(self, f: Callable[[dict], dict]): + self.f = f + + def score(self, data: dict) -> dict: + return self.f(data) + + +@pytest.fixture( + params=[ + "subprocess-local", + "subprocess-blob", + "docker-local", + ] +) +def mock_foundry_client( + request, + monkeypatch, + tmp_path: Path, + capsys, +) -> Generator[dict, None, None]: + if "subprocess" in request.param: + # Already determine a possible working path for the mock of `azcopy`. It might not be used, + # but we do already need to determine it. + azcopy_mock_work_dir = tmp_path / "azcopy_work" + + # Create a subprocess that mocks the runner. + score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" + runner_path = Path(__file__).parents[0] / "runner.py" + p = subprocess.Popen( + ["python", runner_path, azcopy_mock_work_dir, score_script_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + stdin = p.stdin + stdout = p.stdout + assert stdin is not None and stdout is not None + + def _mock_send(message: dict) -> dict: + # Message will be wrapped into the field `data`. + stdin.write(json.dumps({"data": message}).encode("unicode_escape")) + stdin.write(b"\n") + stdin.flush() + + output = stdout.readline() + if not output: + raise RuntimeError("Runner returned no answer. It likely crashed.") + + return json.loads(output.decode("unicode_escape")) + + # Now we decide whether we do communication locally or via blob storage. If we do + # communication via blob storage, we must mock `azcopy` too. 
+ comm_folder = tmp_path / "communication" + + if "local" in request.param: + # Communicate via a local folder. + yield { + "client_comm": LocalCommunication(comm_folder), + "host_comm": LocalCommunication(comm_folder), + "foundry_client": MockFoundryClient(_mock_send), + } + + else: + # Communicate via blob storage, so mock `azcopy` too. + azcopy_path = Path(__file__).parents[0] / "azcopy.py" + monkeypatch.setattr( + BlobStorageCommunication, + "_AZCOPY_EXECUTABLE", + ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ) + # The below test URL must start with `https`! + blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" + yield { + "client_comm": BlobStorageCommunication(blob_url_with_sas), + "host_comm": BlobStorageCommunication(blob_url_with_sas), + "foundry_client": MockFoundryClient(_mock_send), + } + + # Kill the process upon teardown. + p.terminate() + p.wait() + + elif request.param == "docker-local": + client_comm_folder = tmp_path / "communication" + + # It's important to create the communication folder on the client side already. If we don't, + # Docker will create it, and the permissions will then be wrong. + client_comm_folder.mkdir(exist_ok=True, parents=True) + + # Run the Docker container. Assume that it has already been built. + p = subprocess.Popen( + [ + "docker", + "run", + "-p", + "5001:5001", + "--rm", + "-t", + "-v", + f"{client_comm_folder}:/communication", + "aurora-foundry:latest", + ], + ) + + # Wait for the server to come online. + start = time.time() + while True: + try: + res = requests.get("http://127.0.0.1:5001/") + res.raise_for_status() + except (requests.ConnectionError, requests.HTTPError) as e: + # Try for at most 10 seconds. + if time.time() - start < 10: + time.sleep(0.5) + continue + else: + raise e + break + + def _mock_send(message: dict) -> dict: + # The message will be wrapped in a field `data`. 
+ res = requests.post("http://127.0.0.1:5001/score", data=json.dumps({"data": message})) + res.raise_for_status() + return json.loads(res.text) + + yield { + "client_comm": LocalCommunication(client_comm_folder), + "host_comm": LocalCommunication("/communication"), + "foundry_client": MockFoundryClient(_mock_send), + } + + # Kill the container upon teardown. + p.terminate() + p.wait() + + else: + raise ValueError(f"Bad Foundry mock mode: `{request.param}`.") diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py new file mode 100644 index 0000000..53dee9e --- /dev/null +++ b/tests/foundry/runner.py @@ -0,0 +1,66 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license. + +A mock of the Azure ML inference server for more simple testing. +""" + +import importlib.util as util +import json +import logging +import sys +from pathlib import Path + +import click + +# Expose logging messages. +logger = logging.getLogger() +logger.setLevel("INFO") +stream_handler = logging.StreamHandler(sys.stderr) +formatter = logging.Formatter("[%(levelname)s] %(asctime)s %(name)s: %(message)s") +stream_handler.setFormatter(formatter) +logger.addHandler(stream_handler) + + +@click.command() +@click.argument( + "azcopy_mock_work_path", + required=True, + type=click.Path( + exists=False, file_okay=False, dir_okay=True, resolve_path=True, path_type=Path + ), +) +@click.argument( + "path", + required=True, + type=click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True, path_type=Path), +) +def main(azcopy_mock_work_path: Path, path: Path) -> None: + spec = util.spec_from_file_location("score", path) + assert spec is not None, "Could not load specification." + score = util.module_from_spec(spec) + assert score is not None, "Could not load module from specification." + assert spec.loader is not None, "Specification has no loader." + spec.loader.exec_module(score) + + # At this point, we mock `azcopy` too. 
+ azcopy_path = Path(__file__).parents[0] / "azcopy.py" + sys.modules["aurora.foundry"].BlobStorageCommunication._AZCOPY_EXECUTABLE = [ + "python", + str(azcopy_path), + str(azcopy_mock_work_path), + ] + + score.init() + + while True: + raw_data = sys.stdin.readline() + raw_data = raw_data.encode("utf-8").decode("unicode_escape") + + answer = json.dumps(score.run(raw_data)) + + sys.stdout.write(answer.encode("unicode_escape").decode("utf-8")) + sys.stdout.write("\n") + sys.stdout.flush() + + +if __name__ == "__main__": + main() diff --git a/tests/foundry/test_api.py b/tests/foundry/test_api.py new file mode 100644 index 0000000..f84de0d --- /dev/null +++ b/tests/foundry/test_api.py @@ -0,0 +1,33 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +from datetime import datetime + +import torch + +from aurora import Batch, Metadata +from aurora.foundry import submit + + +def test_api(tmp_path, mock_foundry_client: dict): + batch = Batch( + surf_vars={k: torch.randn(1, 2, 17, 32) for k in ("2t", "10u", "10v", "msl")}, + static_vars={k: torch.randn(17, 32) for k in ("lsm", "z", "slt")}, + atmos_vars={k: torch.randn(1, 2, 4, 17, 32) for k in ("z", "u", "v", "t", "q")}, + metadata=Metadata( + lat=torch.linspace(90, -90, 17), + lon=torch.linspace(0, 360, 32 + 1)[:-1], + time=(datetime(2020, 6, 1, 12, 0),), + atmos_levels=(100, 250, 500, 850), + ), + ) + + for i, pred in enumerate( + submit( + batch=batch, + model_name="aurora-0.25-small-pretrained", + num_steps=4, + **mock_foundry_client, + ) + ): + assert isinstance(pred, Batch) + assert pred.metadata.rollout_step == i + 1 From 1176d6f2c142369cb607420f71936132d91d2f98 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Wed, 8 Jan 2025 15:07:47 +0100 Subject: [PATCH 02/44] Fix Docker requirements --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index c7e4725..3ba30b1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,11 +6,11 @@ COPY 
./pyproject.toml . # Assuming dependencies are fairly fixed, we can install them first and then copy the rest of the # code to avoid re-installing dependencies when the code changes. -COPY requirements.txt . +COPY _docker_requirements.txt . RUN pip install --upgrade pip virtualenv && \ virtualenv venv -p python3.10 && \ . venv/bin/activate && \ - pip install -r requirements.txt + pip install -r _docker_requirements.txt # Download model weights. RUN ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-small-pretrained.ckpt")' && \ @@ -25,7 +25,6 @@ RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set . venv/bin/activate && \ SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e . - # Install `azcopy` and the AML inference server. RUN wget https://aka.ms/downloadazcopy-v10-linux -O azcopy.tar.gz && \ cp $(tar -xvzf azcopy.tar.gz | grep azcopy$) /usr/local/bin/azcopy From fe534cf793a13c0c8e593dbd9ed159373175c62f Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Wed, 8 Jan 2025 15:34:02 +0100 Subject: [PATCH 03/44] add swagger3.json --- aurora/foundry/server/swagger3.json | 278 ++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 aurora/foundry/server/swagger3.json diff --git a/aurora/foundry/server/swagger3.json b/aurora/foundry/server/swagger3.json new file mode 100644 index 0000000..1f7ce6e --- /dev/null +++ b/aurora/foundry/server/swagger3.json @@ -0,0 +1,278 @@ +{ + "openapi": "3.1.0", + "info": { + "title": "Aurora Endpoint", + "version": "0.1.0" + }, + "paths": { + "/": { + "get": { + "summary": "Check if alive", + "operationId": "ServiceHealthCheck", + "description": "Simple health check endpoint to ensure the service is up at any given point.", + "responses": { + "200": { + "description": "If service is up and running, this response will be returned with the content 'Healthy'", + "content": { + "text/plain": { 
+ "schema": { + "type": "string", + "examples": [ + "Healthy" + ] + } + } + } + }, + "default": { + "description": "The service failed to execute due to an error.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + } + } + } + }, + "/score/": { + "post": { + "summary": "Create Task", + "operationId": "create_item_score__post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateTask" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateTaskResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "get": { + "summary": "Fetch Result of a Task", + "operationId": "get_item_score_post", + "parameters": [ + { + "name": "taskId", + "in": "query", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Successfull Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Answer" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "CreateTaskResponse": { + "properties": { + "success": { + "type": "boolean", + "title": "Success" + }, + "message": { + "type": "string", + "title": "Message" + }, + "taskId": { + "type": "string", + "title": "Task ID" + } + } + }, + "ErrorResponse": { + "properties": { + "message": { + "type": "string" + } + } + }, + "Answer": { + "properties": { + "success": { + "type": "boolean", + "title": "Success" + }, + "message": { + "type": "string", + "title": "Message" + }, + "data": { + "$ref": "#/components/schemas/ProgressInfo", + "title": "Data" + } + }, + "type": "object", + "required": [ + "success", + "message", + "data" + ], + 
"title": "Answer" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "ProgressInfo": { + "properties": { + "kind": { + "const": "progress_info", + "title": "Kind" + }, + "uuid": { + "type": "string", + "title": "Uuid" + }, + "completed": { + "type": "boolean", + "title": "Completed" + }, + "progress_percentage": { + "type": "integer", + "title": "Progress Percentage" + }, + "error": { + "type": "boolean", + "title": "Error" + }, + "error_info": { + "type": "string", + "title": "Error Info" + } + }, + "type": "object", + "title": "ProgressInfo" + }, + "CreateTask": { + "properties": { + "model_name": { + "type": "string", + "title": "Model Name" + }, + "num_steps": { + "type": "integer", + "title": "Num Steps" + }, + "client_comm": { + "type": "string", + "title": "Client Comm" + }, + "host_comm": { + "type": "string", + "title": "Host Comm" + } + }, + "type": "object", + "required": [ + "request" + ], + "title": "CreateTask" + }, + "SubmissionInfo": { + "properties": { + "kind": { + "const": "submission_info", + "title": "Kind" + }, + "uuid": { + "type": "string", + "title": "Uuid" + } + }, + "type": "object", + "title": "SubmissionInfo" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" + } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" + }, + "securitySchemes": { + "Bearer": { + "type": "apiKey", + "name": "Authorization", + "in": "header", + "description": "For example Bearer abc123" + } + } + } + } +} From 18efcdc41979acedc71340ef248803f15dd6b050 Mon Sep 17 00:00:00 2001 
From: Hannes Schulz Date: Wed, 8 Jan 2025 23:32:27 +0100 Subject: [PATCH 04/44] place swagger correctly, incorporate feedback --- Dockerfile | 4 ++++ aurora/foundry/server/swagger3.json | 23 +++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3ba30b1..1155993 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,4 +35,8 @@ RUN . ./venv/bin/activate && \ EXPOSE 5001 ENV PORT=5001 +# we don't have a swagger2.json file, so we'll just "ignore" the version option and always return a version 3 file +RUN cp ./aurora/foundry/server/swagger3.json ./swagger3.json +RUN cp ./aurora/foundry/server/swagger3.json ./swagger2.json + CMD ["./venv/bin/azmlinfsrv", "--entry_script", "aurora/foundry/server/score.py"] diff --git a/aurora/foundry/server/swagger3.json b/aurora/foundry/server/swagger3.json index 1f7ce6e..e8db81a 100644 --- a/aurora/foundry/server/swagger3.json +++ b/aurora/foundry/server/swagger3.json @@ -4,6 +4,11 @@ "title": "Aurora Endpoint", "version": "0.1.0" }, + "security": [ + { + "Bearer": [] + } + ], "paths": { "/": { "get": { @@ -37,8 +42,13 @@ } } }, - "/score/": { + "/score": { "post": { + "security": [ + { + "Bearer": [] + } + ], "summary": "Create Task", "operationId": "create_item_score__post", "requestBody": { @@ -75,11 +85,16 @@ } }, "get": { - "summary": "Fetch Result of a Task", + "security": [ + { + "Bearer": [] + } + ], + "summary": "Fetch Progress of a Task", "operationId": "get_item_score_post", "parameters": [ { - "name": "taskId", + "name": "task_id", "in": "query", "required": true, "schema": { @@ -114,7 +129,7 @@ "type": "string", "title": "Message" }, - "taskId": { + "task_id": { "type": "string", "title": "Task ID" } From deb9955221ad46da2cc351cfe7d588f2bb129642 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Wed, 8 Jan 2025 23:36:13 +0100 Subject: [PATCH 05/44] minor --- Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile 
index 1155993..8d5a518 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,7 +36,10 @@ EXPOSE 5001 ENV PORT=5001 # we don't have a swagger2.json file, so we'll just "ignore" the version option and always return a version 3 file -RUN cp ./aurora/foundry/server/swagger3.json ./swagger3.json -RUN cp ./aurora/foundry/server/swagger3.json ./swagger2.json +RUN cp ./aurora/foundry/server/swagger3.json ./swagger2.json && \ + cp ./aurora/foundry/server/swagger3.json ./swagger2.0.json && \ + cp ./aurora/foundry/server/swagger3.json ./swagger3.1.json && \ + cp ./aurora/foundry/server/swagger3.json ./swagger3.0.json && \ + cp ./aurora/foundry/server/swagger3.json ./swagger3.json CMD ["./venv/bin/azmlinfsrv", "--entry_script", "aurora/foundry/server/score.py"] From fb30897686939cce28e59dc0ee48b4b5f382905f Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 00:20:56 +0100 Subject: [PATCH 06/44] reorganize Dockerfile for faster iteration --- Dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8d5a518..fe3c726 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,21 +16,21 @@ RUN pip install --upgrade pip virtualenv && \ RUN ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-small-pretrained.ckpt")' && \ ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-finetuned.ckpt")' -COPY ./aurora ./aurora COPY ./LICENSE.txt . COPY ./README.md . -ARG AURORA_REPO_VERSION -RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set."; exit 1; } && \ - . venv/bin/activate && \ - SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e . - # Install `azcopy` and the AML inference server. RUN wget https://aka.ms/downloadazcopy-v10-linux -O azcopy.tar.gz && \ cp $(tar -xvzf azcopy.tar.gz | grep azcopy$) /usr/local/bin/azcopy RUN . 
./venv/bin/activate && \ pip install azureml-inference-server-http +COPY ./aurora ./aurora +ARG AURORA_REPO_VERSION +RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set."; exit 1; } && \ + . venv/bin/activate && \ + SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e . + # Make port 5001 available to the world outside this container. EXPOSE 5001 ENV PORT=5001 From 5ffd9f3c367b1062025bebfba65c841d4be5639a Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 00:21:24 +0100 Subject: [PATCH 07/44] dispatch based on HTTP method --- aurora/foundry/client/foundry.py | 2 +- aurora/foundry/server/score.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/aurora/foundry/client/foundry.py b/aurora/foundry/client/foundry.py index 09e21e8..a115880 100644 --- a/aurora/foundry/client/foundry.py +++ b/aurora/foundry/client/foundry.py @@ -56,6 +56,6 @@ def _req( ) def score(self, data: dict) -> dict: - answer = self._req("POST", "score", {"data": data}) + answer = self._req("POST", "score", data) answer.raise_for_status() return answer.json() diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index dddc701..f76c226 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -6,6 +6,9 @@ from concurrent.futures import ThreadPoolExecutor from typing import Literal, Union from uuid import uuid4 +from azureml_inference_server_http.api.aml_response import AMLResponse +from azureml_inference_server_http.api.aml_request import AMLRequest +from azureml_inference_server_http.api.aml_request import rawhttp from pydantic import BaseModel, Field @@ -85,7 +88,8 @@ def init() -> None: POOL.__enter__() -def run(raw_data: str) -> dict: +@rawhttp +def run(input_data: AMLRequest) -> dict: """Perform predictions. Args: @@ -95,12 +99,12 @@ def run(raw_data: str) -> dict: dict: Answer, which will be encoded as JSON. 
""" logger.info("Received request.") - raw = json.loads(raw_data) - request = Request(**raw["data"]).request - - if isinstance(request, Submission): + if input_data.method == "POST": logger.info("Submitting new task to thread pool.") - task = Task(request) + try: + task = Task(**input_data.get_json()) + except Exception as exc: + return AMLResponse(dict(message=str(exc)), 500, {}, json_str=True) POOL.submit(task) TASKS[task.uuid] = task return { @@ -112,9 +116,9 @@ def run(raw_data: str) -> dict: }, } - elif isinstance(request, Check): + elif input_data.method == "GET": logger.info("Returning the status of an existing task.") - uuid = request.uuid + uuid = input_data.args.get("task_id") time.sleep(1) # Sleep here, so the client does not need to. if uuid not in TASKS: return { @@ -138,7 +142,4 @@ def run(raw_data: str) -> dict: else: # This branch should be unreachable. - return { - "success": False, - "message": "Invalid action.", - } + return AMLRequest(dict(message="Method not allowed."), 405, {}, json_str=True) From 00e49aa4d08a82305496a34420ad2fabeb7df78f Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 00:28:41 +0100 Subject: [PATCH 08/44] minor --- aurora/foundry/server/score.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index f76c226..63f05eb 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -1,6 +1,5 @@ """Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" -import json import logging import time from concurrent.futures import ThreadPoolExecutor @@ -28,7 +27,6 @@ class Submission(BaseModel): - action: Literal["submit"] host_comm: CommSpecs = Field(..., discriminator="class_name") model_name: str num_steps: int @@ -39,10 +37,6 @@ class Check(BaseModel): uuid: str -class Request(BaseModel): - request: Union[Submission, Check] = Field(..., discriminator="action") - - POOL = ThreadPoolExecutor(max_workers=1) TASKS = dict() @@ -102,7 +96,7 @@ def run(input_data: AMLRequest) -> dict: if input_data.method == "POST": logger.info("Submitting new task to thread pool.") try: - task = Task(**input_data.get_json()) + task = Task(Submission(**input_data.get_json())) except Exception as exc: return AMLResponse(dict(message=str(exc)), 500, {}, json_str=True) POOL.submit(task) @@ -119,7 +113,8 @@ def run(input_data: AMLRequest) -> dict: elif input_data.method == "GET": logger.info("Returning the status of an existing task.") uuid = input_data.args.get("task_id") - time.sleep(1) # Sleep here, so the client does not need to. + if not uuid: + return AMLRequest(dict(message="Missing task_id query parameter."), 400, {}, json_str=True) if uuid not in TASKS: return { "success": False, @@ -127,6 +122,12 @@ def run(input_data: AMLRequest) -> dict: } else: task = TASKS[uuid] + # Allow the task some time to complete. + # We sleep here so the client does not query too frequently. 
+ for _ in range(3): + if task.completed: + break + time.sleep(1) return { "success": True, "message": "Status check completed.", From 7eee7ed97afb52ea34a66a386afe023f521dd20e Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 00:38:15 +0100 Subject: [PATCH 09/44] docstr --- aurora/foundry/server/score.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 63f05eb..3a4007b 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -87,10 +87,10 @@ def run(input_data: AMLRequest) -> dict: """Perform predictions. Args: - raw_data (str): Request as JSON. + input_data (AMLRequest): Mostly a Flask Request object. Returns: - dict: Answer, which will be encoded as JSON. + dict/AMLResponse: The response to the request. dicts are implictitly 200 AMLResponses. """ logger.info("Received request.") if input_data.method == "POST": From f0af8d0cec0de33352bb8e9336af2ad9bcfe313e Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 09:35:44 +0100 Subject: [PATCH 10/44] linting --- aurora/foundry/server/score.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 3a4007b..f895444 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -5,10 +5,9 @@ from concurrent.futures import ThreadPoolExecutor from typing import Literal, Union from uuid import uuid4 -from azureml_inference_server_http.api.aml_response import AMLResponse -from azureml_inference_server_http.api.aml_request import AMLRequest -from azureml_inference_server_http.api.aml_request import rawhttp +from azureml_inference_server_http.api.aml_request import AMLRequest, rawhttp +from azureml_inference_server_http.api.aml_response import AMLResponse from pydantic import BaseModel, Field from aurora.foundry.common.channel import ( @@ -114,7 +113,12 @@ def 
run(input_data: AMLRequest) -> dict: logger.info("Returning the status of an existing task.") uuid = input_data.args.get("task_id") if not uuid: - return AMLRequest(dict(message="Missing task_id query parameter."), 400, {}, json_str=True) + return AMLRequest( + dict(message="Missing task_id query parameter."), + 400, + {}, + json_str=True, + ) if uuid not in TASKS: return { "success": False, From 26a9a983f09d20e7a7baf39b2d2dfdc9de95c8b9 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 10:18:22 +0100 Subject: [PATCH 11/44] update api.py --- aurora/foundry/client/api.py | 27 +++++++-------------------- aurora/foundry/client/foundry.py | 22 +++++++++++++++++++--- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index 2159889..71e2755 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -73,23 +73,18 @@ def submit( # Send a request to the endpoint to produce the predictions. data = { "request": { - "action": "submit", "model_name": model_name, "num_steps": num_steps, "host_comm": host_comm.to_spec(), } } - answer = Answer(**foundry_client.score(data)) - if not answer.success: - raise SubmissionError(answer.message) - submission_info = answer.data - if not isinstance(submission_info, SubmissionInfo): - raise SubmissionError( - "Server returned no submission information. " - "Cannot determine task UUID to track tasks." - ) + response = foundry_client.submit_task(data) + try: + submission_info = SubmissionInfo(**response) + except Exception as e: + raise SubmissionError(response["message"]) from e task_uuid = submission_info.uuid - logger.info("Submitted request to endpoint.") + logger.info("Submitted task %r to endpoint.", task_uuid) # Send the initial condition over. client_comm.send(batch, task_uuid, "input.nc") @@ -99,15 +94,7 @@ def submit( while True: # Check on the progress of the task. 
data = {"request": {"action": "check", "uuid": task_uuid}} - answer = Answer(**foundry_client.score(data)) - if not answer.success: - raise SubmissionError(answer.message) - progress_info = answer.data - if not isinstance(progress_info, ProgressInfo): - raise SubmissionError( - "Server returned no progress information. " - "Cannot determine whether the task has been completed or not." - ) + progress_info = ProgressInfo(**foundry_client.get_progress(task_uuid)) if progress_info.error: raise SubmissionError(f"Task failed: {progress_info.error_info}") diff --git a/aurora/foundry/client/foundry.py b/aurora/foundry/client/foundry.py index a115880..21c46f5 100644 --- a/aurora/foundry/client/foundry.py +++ b/aurora/foundry/client/foundry.py @@ -17,14 +17,25 @@ class AbstractFoundryClient(metaclass=abc.ABCMeta): """A client to talk to Azure AI Foundry.""" @abc.abstractmethod - def score(self, data: dict) -> dict: + def submit_task(self, data: dict) -> dict: """Send `data` to the scoring path. Args: data (dict): Data to send. Returns: - dict: Answer. + dict: SubmissionInfo. + """ + + @abc.abstractmethod + def get_progress(self, task_id: str) -> dict: + """Get the progress of the task. + + Args: + task_id (str): Task ID to get progress info for. + + Returns: + dict: ProgressInfo. 
""" @@ -55,7 +66,12 @@ def _req( json=data, ) - def score(self, data: dict) -> dict: + def submit_task(self, data: dict) -> dict: answer = self._req("POST", "score", data) answer.raise_for_status() return answer.json() + + def get_progress(self, task_id: str) -> dict: + answer = self._req("GET", f"score?task_id={task_id}") + answer.raise_for_status() + return answer.json() From bc3a19aab54c68185382a8c86bd6d63d35fbf975 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 10:19:23 +0100 Subject: [PATCH 12/44] simplify --- aurora/foundry/client/api.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index 71e2755..79afe6e 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -71,14 +71,12 @@ def submit( raise KeyError(f"Model `{model_name}` is not a valid model.") # Send a request to the endpoint to produce the predictions. - data = { - "request": { - "model_name": model_name, - "num_steps": num_steps, - "host_comm": host_comm.to_spec(), - } + task = { + "model_name": model_name, + "num_steps": num_steps, + "host_comm": host_comm.to_spec(), } - response = foundry_client.submit_task(data) + response = foundry_client.submit_task(task) try: submission_info = SubmissionInfo(**response) except Exception as e: @@ -93,7 +91,6 @@ def submit( while True: # Check on the progress of the task. 
- data = {"request": {"action": "check", "uuid": task_uuid}} progress_info = ProgressInfo(**foundry_client.get_progress(task_uuid)) if progress_info.error: From 70371381b5abeaeaa93c406cde601f08a25fc3fd Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 11:17:09 +0100 Subject: [PATCH 13/44] WIP --- pyproject.toml | 1 + tests/foundry/conftest.py | 203 ++++++++++++++++++++------------------ 2 files changed, 108 insertions(+), 96 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6a3c2bd..8d773ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ "mypy", "ruff==0.4.1", "pre-commit", + "requests_mock", "jupyter-book", "scipy", ] diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index a5c6f53..a5120e0 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -3,22 +3,61 @@ import json import subprocess import time +from contextlib import contextmanager from pathlib import Path -from typing import Callable, Generator +from typing import Generator import pytest import requests -from aurora.foundry.client.foundry import AbstractFoundryClient +from aurora.foundry.client.foundry import FoundryClient from aurora.foundry.common.channel import BlobStorageCommunication, LocalCommunication - -class MockFoundryClient(AbstractFoundryClient): - def __init__(self, f: Callable[[dict], dict]): - self.f = f - - def score(self, data: dict) -> dict: - return self.f(data) +MOCK_ADDRESS = "https://mock-foundry.azurewebsites.net" + + +@contextmanager +def runner_process(): + score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" + runner_path = Path(__file__).parents[0] / "runner.py" + p = subprocess.Popen( + ["python", runner_path, score_script_path], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + stdin = p.stdin + stdout = p.stdout + assert stdin is not None and stdout is not None + yield p, stdin, stdout + p.terminate() + p.wait() + + +@contextmanager +def 
mock_foundry_responses_subprocess(stdin, stdout, requests_mock, base_address=MOCK_ADDRESS): + def _mock_send(request, context) -> dict: + method = request.method.encode("unicode_escape") + stdin.write(method + b"\n") + stdin.write(method.url + b"\n") + stdin.write(request.text().encode("unicode_escape") + b"\n") + stdin.flush() + + output = stdout.readline() + if not output: + raise RuntimeError("Runner returned no answer. It likely crashed.") + + return json.loads(output.decode("unicode_escape")) + + task_uuid = "mock-uuid" + requests_mock.post( + f"{base_address}/score", + json=_mock_send, + ) + requests_mock.get( + f"{base_address}/score?uuid={task_uuid}", + json=_mock_send, + ) + yield @pytest.fixture( @@ -31,69 +70,47 @@ def score(self, data: dict) -> dict: def mock_foundry_client( request, monkeypatch, + requests_mock, tmp_path: Path, - capsys, ) -> Generator[dict, None, None]: + azcopy_mock_work_dir = tmp_path / "azcopy_work" + if "subprocess" in request.param: # Already determine a possible working path for the mock of `azcopy`. It might not be used, # but we do already need to determine it. - azcopy_mock_work_dir = tmp_path / "azcopy_work" - - # Create a subprocess that mocks the runner. - score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" - runner_path = Path(__file__).parents[0] / "runner.py" - p = subprocess.Popen( - ["python", runner_path, azcopy_mock_work_dir, score_script_path], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - ) - stdin = p.stdin - stdout = p.stdout - assert stdin is not None and stdout is not None - - def _mock_send(message: dict) -> dict: - # Message will be wrapped into the field `data`. - stdin.write(json.dumps({"data": message}).encode("unicode_escape")) - stdin.write(b"\n") - stdin.flush() - - output = stdout.readline() - if not output: - raise RuntimeError("Runner returned no answer. 
It likely crashed.") - return json.loads(output.decode("unicode_escape")) - - # Now we decide whether we do communication locally or via blob storage. If we do - # communication via blob storage, we must mock `azcopy` too. - comm_folder = tmp_path / "communication" - - if "local" in request.param: - # Communicate via a local folder. - yield { - "client_comm": LocalCommunication(comm_folder), - "host_comm": LocalCommunication(comm_folder), - "foundry_client": MockFoundryClient(_mock_send), - } - - else: - # Communicate via blob storage, so mock `azcopy` too. - azcopy_path = Path(__file__).parents[0] / "azcopy.py" - monkeypatch.setattr( - BlobStorageCommunication, - "_AZCOPY_EXECUTABLE", - ["python", str(azcopy_path), str(azcopy_mock_work_dir)], - ) - # The below test URL must start with `https`! - blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" - yield { - "client_comm": BlobStorageCommunication(blob_url_with_sas), - "host_comm": BlobStorageCommunication(blob_url_with_sas), - "foundry_client": MockFoundryClient(_mock_send), - } - - # Kill the process upon teardown. - p.terminate() - p.wait() + with runner_process() as (p, stdin, stdout), mock_foundry_responses_subprocess( + stdin, stdout, requests_mock + ): + # Now we decide whether we do communication locally or via blob storage. If we do + # communication via blob storage, we must mock `azcopy` too. + comm_folder = tmp_path / "communication" + + if "local" in request.param: + # Communicate via a local folder. + yield { + "client_comm": LocalCommunication(comm_folder), + "host_comm": LocalCommunication(comm_folder), + "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), + } + + else: + # Communicate via blob storage, so mock `azcopy` too. 
+ azcopy_path = Path(__file__).parents[0] / "azcopy.py" + monkeypatch.setattr( + BlobStorageCommunication, + "_AZCOPY_EXECUTABLE", + ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ) + # The below test URL must start with `https`! + blob_url_with_sas = ( + "https://storageaccount.blob.core.windows.net/container/folder?SAS" + ) + yield { + "client_comm": BlobStorageCommunication(blob_url_with_sas), + "host_comm": BlobStorageCommunication(blob_url_with_sas), + "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), + } elif request.param == "docker-local": client_comm_folder = tmp_path / "communication" @@ -116,37 +133,31 @@ def _mock_send(message: dict) -> dict: "aurora-foundry:latest", ], ) + try: + # Wait for the server to come online. + start = time.time() + while True: + try: + res = requests.get("http://127.0.0.1:5001/") + res.raise_for_status() + except (requests.ConnectionError, requests.HTTPError) as e: + # Try for at most 10 seconds. + if time.time() - start < 10: + time.sleep(0.5) + continue + else: + raise e + break + + yield { + "client_comm": LocalCommunication(client_comm_folder), + "host_comm": LocalCommunication("/communication"), + "foundry_client": FoundryClient("https://127.0.0.1:5001", "mock-token"), + } - # Wait for the server to come online. - start = time.time() - while True: - try: - res = requests.get("http://127.0.0.1:5001/") - res.raise_for_status() - except (requests.ConnectionError, requests.HTTPError) as e: - # Try for at most 10 seconds. - if time.time() - start < 10: - time.sleep(0.5) - continue - else: - raise e - break - - def _mock_send(message: dict) -> dict: - # The message will be wrapped in a field `data`. 
- res = requests.post("http://127.0.0.1:5001/score", data=json.dumps({"data": message})) - res.raise_for_status() - return json.loads(res.text) - - yield { - "client_comm": LocalCommunication(client_comm_folder), - "host_comm": LocalCommunication("/communication"), - "foundry_client": MockFoundryClient(_mock_send), - } - - # Kill the container upon teardown. - p.terminate() - p.wait() + finally: + p.terminate() + p.wait() else: raise ValueError(f"Bad Foundry mock mode: `{request.param}`.") From cb2f8cc56666881399c2e8bc7f08be37a7bbff5e Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 12:13:44 +0100 Subject: [PATCH 14/44] WIP --- aurora/foundry/client/api.py | 13 +++---------- aurora/foundry/server/score.py | 7 +------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index 79afe6e..a597cf6 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -4,9 +4,9 @@ """ import logging -from typing import Generator, Literal, Optional, Union +from typing import Generator, Literal -from pydantic import BaseModel, Field +from pydantic import BaseModel from aurora import Batch from aurora.foundry.client.foundry import AbstractFoundryClient @@ -19,8 +19,7 @@ class SubmissionInfo(BaseModel): - kind: Literal["submission_info"] - uuid: str + task_id: str class ProgressInfo(BaseModel): @@ -32,12 +31,6 @@ class ProgressInfo(BaseModel): error_info: str -class Answer(BaseModel): - success: bool - message: str - data: Optional[Union[SubmissionInfo, ProgressInfo]] = Field(..., discriminator="kind") - - class SubmissionError(Exception): """The submission could not be completed for some reason.""" diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index f895444..04d8ecb 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -101,12 +101,7 @@ def run(input_data: AMLRequest) -> dict: POOL.submit(task) TASKS[task.uuid] = task 
return { - "success": True, - "message": "Request has been succesfully submitted.", - "data": { - "kind": "submission_info", - "uuid": task.uuid, - }, + "task_id": task.uuid, } elif input_data.method == "GET": From d0fe591beb1643b4f81e53cedd82ad98deac4ef7 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 12:14:11 +0100 Subject: [PATCH 15/44] WIP --- tests/foundry/conftest.py | 12 +++++++----- tests/foundry/runner.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index a5120e0..97b4561 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -17,11 +17,11 @@ @contextmanager -def runner_process(): +def runner_process(azcopy_mock_work_dir): score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" runner_path = Path(__file__).parents[0] / "runner.py" p = subprocess.Popen( - ["python", runner_path, score_script_path], + ["python", runner_path, azcopy_mock_work_dir, score_script_path], stdin=subprocess.PIPE, stdout=subprocess.PIPE, ) @@ -38,8 +38,10 @@ def mock_foundry_responses_subprocess(stdin, stdout, requests_mock, base_address def _mock_send(request, context) -> dict: method = request.method.encode("unicode_escape") stdin.write(method + b"\n") - stdin.write(method.url + b"\n") - stdin.write(request.text().encode("unicode_escape") + b"\n") + stdin.write(request.path.encode("unicode_escape") + b"\n") + stdin.write(json.dumps(request.qs).encode("unicode_escape") + b"\n") + stdin.write(json.dumps(dict(request.headers)).encode("unicode_escape") + b"\n") + stdin.write(request.text.encode("unicode_escape") + b"\n") stdin.flush() output = stdout.readline() @@ -79,7 +81,7 @@ def mock_foundry_client( # Already determine a possible working path for the mock of `azcopy`. It might not be used, # but we do already need to determine it. 
- with runner_process() as (p, stdin, stdout), mock_foundry_responses_subprocess( + with runner_process(azcopy_mock_work_dir) as (p, stdin, stdout), mock_foundry_responses_subprocess( stdin, stdout, requests_mock ): # Now we decide whether we do communication locally or via blob storage. If we do diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index 53dee9e..2806d6c 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -8,6 +8,9 @@ import logging import sys from pathlib import Path +from flask import Request +from werkzeug.test import EnvironBuilder +from werkzeug.wrappers import Request as WerkzeugRequest import click @@ -52,12 +55,31 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: score.init() while True: - raw_data = sys.stdin.readline() - raw_data = raw_data.encode("utf-8").decode("unicode_escape") + method = sys.stdin.readline().strip() + base_url = sys.stdin.readline().strip() + query_params = json.loads(sys.stdin.readline().encode("utf-8").strip()) + headers = json.loads(sys.stdin.readline().encode("utf-8").strip()) + payload = sys.stdin.readline().encode("utf-8").strip() - answer = json.dumps(score.run(raw_data)) + builder = EnvironBuilder( + method=method, + base_url=base_url, + headers={ + "Content-Type": "application/json" + }, + data=payload, + ) + env = builder.get_environ() + flask_request = Request(env) - sys.stdout.write(answer.encode("unicode_escape").decode("utf-8")) + resp = score.run(flask_request) + if isinstance(resp, dict): + answer = json.dumps(resp).encode("utf-8") + else: + answer = resp.data + print("DATA", answer) + + sys.stdout.write(answer.decode("utf-8")) sys.stdout.write("\n") sys.stdout.flush() From 27a8a646f5792c1339273a054deed427e9682026 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 12:42:02 +0100 Subject: [PATCH 16/44] api tests passing --- aurora/foundry/client/api.py | 14 +++++++------- aurora/foundry/server/score.py | 19 +++++++------------ 
tests/foundry/conftest.py | 10 ++++++---- tests/foundry/runner.py | 8 +++----- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index a597cf6..af7eb3a 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -23,8 +23,7 @@ class SubmissionInfo(BaseModel): class ProgressInfo(BaseModel): - kind: Literal["progress_info"] - uuid: str + task_id: str completed: bool progress_percentage: int error: bool @@ -74,17 +73,18 @@ def submit( submission_info = SubmissionInfo(**response) except Exception as e: raise SubmissionError(response["message"]) from e - task_uuid = submission_info.uuid - logger.info("Submitted task %r to endpoint.", task_uuid) + task_id = submission_info.task_id + logger.info("Submitted task %r to endpoint.", task_id) # Send the initial condition over. - client_comm.send(batch, task_uuid, "input.nc") + client_comm.send(batch, task_id, "input.nc") previous_progress: int = 0 while True: # Check on the progress of the task. 
- progress_info = ProgressInfo(**foundry_client.get_progress(task_uuid)) + response = foundry_client.get_progress(task_id) + progress_info = ProgressInfo(**response) if progress_info.error: raise SubmissionError(f"Task failed: {progress_info.error_info}") @@ -99,4 +99,4 @@ def submit( logger.info("Retrieving predictions.") for prediction_name in iterate_prediction_files("prediction.nc", num_steps): - yield client_comm.receive(task_uuid, prediction_name) + yield client_comm.receive(task_id, prediction_name) diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 04d8ecb..a90681c 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -108,7 +108,7 @@ def run(input_data: AMLRequest) -> dict: logger.info("Returning the status of an existing task.") uuid = input_data.args.get("task_id") if not uuid: - return AMLRequest( + return AMLResponse( dict(message="Missing task_id query parameter."), 400, {}, @@ -128,18 +128,13 @@ def run(input_data: AMLRequest) -> dict: break time.sleep(1) return { - "success": True, - "message": "Status check completed.", - "data": { - "kind": "progress_info", - "uuid": uuid, - "completed": task.completed, - "progress_percentage": task.progress_percentage, - "error": task.exc is not None, - "error_info": str(task.exc) if task.exc else "", - }, + "task_id": uuid, + "completed": task.completed, + "progress_percentage": task.progress_percentage, + "error": task.exc is not None, + "error_info": str(task.exc) if task.exc else "", } else: # This branch should be unreachable. - return AMLRequest(dict(message="Method not allowed."), 405, {}, json_str=True) + return AMLResponse(dict(message="Method not allowed."), 405, {}, json_str=True) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 97b4561..d91d41c 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -1,6 +1,7 @@ """Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" import json +import re import subprocess import time from contextlib import contextmanager @@ -37,11 +38,12 @@ def runner_process(azcopy_mock_work_dir): def mock_foundry_responses_subprocess(stdin, stdout, requests_mock, base_address=MOCK_ADDRESS): def _mock_send(request, context) -> dict: method = request.method.encode("unicode_escape") + text = request.text or "" stdin.write(method + b"\n") stdin.write(request.path.encode("unicode_escape") + b"\n") - stdin.write(json.dumps(request.qs).encode("unicode_escape") + b"\n") + stdin.write(request.url.partition('?')[2].encode("unicode_escape") + b"\n") stdin.write(json.dumps(dict(request.headers)).encode("unicode_escape") + b"\n") - stdin.write(request.text.encode("unicode_escape") + b"\n") + stdin.write(text.encode("unicode_escape") + b"\n") stdin.flush() output = stdout.readline() @@ -50,13 +52,12 @@ def _mock_send(request, context) -> dict: return json.loads(output.decode("unicode_escape")) - task_uuid = "mock-uuid" requests_mock.post( f"{base_address}/score", json=_mock_send, ) requests_mock.get( - f"{base_address}/score?uuid={task_uuid}", + re.compile(f"{base_address}/score\?task_id=.*"), json=_mock_send, ) yield @@ -115,6 +116,7 @@ def mock_foundry_client( } elif request.param == "docker-local": + requests_mock.real_http = True client_comm_folder = tmp_path / "communication" # It's important to create the communication folder on the client side already. 
If we don't, diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index 2806d6c..86a6b9c 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -57,16 +57,15 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: while True: method = sys.stdin.readline().strip() base_url = sys.stdin.readline().strip() - query_params = json.loads(sys.stdin.readline().encode("utf-8").strip()) + query_params = sys.stdin.readline().strip() headers = json.loads(sys.stdin.readline().encode("utf-8").strip()) payload = sys.stdin.readline().encode("utf-8").strip() builder = EnvironBuilder( method=method, base_url=base_url, - headers={ - "Content-Type": "application/json" - }, + query_string=query_params, + headers=headers, data=payload, ) env = builder.get_environ() @@ -77,7 +76,6 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: answer = json.dumps(resp).encode("utf-8") else: answer = resp.data - print("DATA", answer) sys.stdout.write(answer.decode("utf-8")) sys.stdout.write("\n") From b8f473aa88a5a9f2dc4495dcfbde817300817d43 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 14:12:49 +0100 Subject: [PATCH 17/44] fix --- aurora/foundry/client/foundry.py | 2 +- pyproject.toml | 1 - tests/foundry/conftest.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/aurora/foundry/client/foundry.py b/aurora/foundry/client/foundry.py index 21c46f5..2d13bea 100644 --- a/aurora/foundry/client/foundry.py +++ b/aurora/foundry/client/foundry.py @@ -58,7 +58,7 @@ def _req( ) -> requests.Response: return requests.request( method, - os.path.join(self.endpoint, path), + self.endpoint.rstrip('/') + '/' + path.lstrip('/'), headers={ "Authorization": f"Bearer {self.token}", "Content-Type": "application/json", diff --git a/pyproject.toml b/pyproject.toml index 8d773ef..6a3c2bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,6 @@ dev = [ "mypy", "ruff==0.4.1", "pre-commit", - "requests_mock", "jupyter-book", "scipy", ] diff 
--git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index d91d41c..5144d81 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -156,7 +156,7 @@ def mock_foundry_client( yield { "client_comm": LocalCommunication(client_comm_folder), "host_comm": LocalCommunication("/communication"), - "foundry_client": FoundryClient("https://127.0.0.1:5001", "mock-token"), + "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), } finally: From afdc6a45cf1ca11f269e8b3f4f44ef91baa1bec8 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 15:13:09 +0100 Subject: [PATCH 18/44] minor --- pyproject.toml | 1 + tests/foundry/runner.py | 20 ++++++++------------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6a3c2bd..8d773ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ "mypy", "ruff==0.4.1", "pre-commit", + "requests_mock", "jupyter-book", "scipy", ] diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index 86a6b9c..bc6a809 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -8,11 +8,10 @@ import logging import sys from pathlib import Path -from flask import Request -from werkzeug.test import EnvironBuilder -from werkzeug.wrappers import Request as WerkzeugRequest import click +from flask import Request +from werkzeug.test import EnvironBuilder # Expose logging messages. 
logger = logging.getLogger() @@ -62,20 +61,17 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: payload = sys.stdin.readline().encode("utf-8").strip() builder = EnvironBuilder( - method=method, - base_url=base_url, - query_string=query_params, - headers=headers, - data=payload, + method=method, + base_url=base_url, + query_string=query_params, + headers=headers, + data=payload, ) env = builder.get_environ() flask_request = Request(env) resp = score.run(flask_request) - if isinstance(resp, dict): - answer = json.dumps(resp).encode("utf-8") - else: - answer = resp.data + answer = json.dumps(resp).encode("utf-8") if isinstance(resp, dict) else resp.data sys.stdout.write(answer.decode("utf-8")) sys.stdout.write("\n") From 30c7e35da16b4c5b4ba9b0cacb61530d273d5a52 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Thu, 9 Jan 2025 17:46:47 +0100 Subject: [PATCH 19/44] Add missing dev dependencies --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d773ef..a59b511 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,12 +52,13 @@ dev = [ "ghp-import", "pytest", "pytest-cov", + "requests_mock", "mypy", "ruff==0.4.1", "pre-commit", - "requests_mock", "jupyter-book", - "scipy", + "flask", + "azureml-inference-server-http", ] [project.urls] From c173425a1df4ed88291bc873dba4edd509651b29 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Thu, 9 Jan 2025 17:49:05 +0100 Subject: [PATCH 20/44] Fix formatting --- aurora/foundry/client/api.py | 2 +- aurora/foundry/client/foundry.py | 3 +-- tests/foundry/conftest.py | 12 +++++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index af7eb3a..b3545b9 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -4,7 +4,7 @@ """ import logging -from typing import Generator, Literal +from typing import Generator from pydantic import BaseModel diff 
--git a/aurora/foundry/client/foundry.py b/aurora/foundry/client/foundry.py index 2d13bea..6c40c5e 100644 --- a/aurora/foundry/client/foundry.py +++ b/aurora/foundry/client/foundry.py @@ -2,7 +2,6 @@ import abc import logging -import os from typing import Literal import requests @@ -58,7 +57,7 @@ def _req( ) -> requests.Response: return requests.request( method, - self.endpoint.rstrip('/') + '/' + path.lstrip('/'), + self.endpoint.rstrip("/") + "/" + path.lstrip("/"), headers={ "Authorization": f"Bearer {self.token}", "Content-Type": "application/json", diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 5144d81..6b964f0 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -41,7 +41,7 @@ def _mock_send(request, context) -> dict: text = request.text or "" stdin.write(method + b"\n") stdin.write(request.path.encode("unicode_escape") + b"\n") - stdin.write(request.url.partition('?')[2].encode("unicode_escape") + b"\n") + stdin.write(request.url.partition("?")[2].encode("unicode_escape") + b"\n") stdin.write(json.dumps(dict(request.headers)).encode("unicode_escape") + b"\n") stdin.write(text.encode("unicode_escape") + b"\n") stdin.flush() @@ -57,7 +57,7 @@ def _mock_send(request, context) -> dict: json=_mock_send, ) requests_mock.get( - re.compile(f"{base_address}/score\?task_id=.*"), + re.compile(rf"{base_address}/score\?task_id=.*"), json=_mock_send, ) yield @@ -82,9 +82,11 @@ def mock_foundry_client( # Already determine a possible working path for the mock of `azcopy`. It might not be used, # but we do already need to determine it. - with runner_process(azcopy_mock_work_dir) as (p, stdin, stdout), mock_foundry_responses_subprocess( - stdin, stdout, requests_mock - ): + with runner_process(azcopy_mock_work_dir) as ( + p, + stdin, + stdout, + ), mock_foundry_responses_subprocess(stdin, stdout, requests_mock): # Now we decide whether we do communication locally or via blob storage. 
If we do # communication via blob storage, we must mock `azcopy` too. comm_folder = tmp_path / "communication" From 06ee747a74525871b63ce1d3e23a7dc1fb23c302 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Thu, 9 Jan 2025 21:11:19 +0100 Subject: [PATCH 21/44] Test blob storage protocol also for Docker image --- aurora/foundry/common/channel.py | 23 +------- aurora/foundry/server/_hook.py | 5 ++ aurora/foundry/server/score.py | 1 + tests/foundry/azcopy.py | 13 ++++- tests/foundry/conftest.py | 90 +++++++++++++++++++++++++++-- tests/foundry/docker_server_hook.py | 49 ++++++++++++++++ tests/foundry/runner.py | 75 ++++++++++++++++-------- 7 files changed, 204 insertions(+), 52 deletions(-) create mode 100644 aurora/foundry/server/_hook.py create mode 100644 tests/foundry/docker_server_hook.py diff --git a/aurora/foundry/common/channel.py b/aurora/foundry/common/channel.py index 9b394a9..83625c4 100644 --- a/aurora/foundry/common/channel.py +++ b/aurora/foundry/common/channel.py @@ -1,7 +1,6 @@ """Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" import abc -import json import logging import os import subprocess @@ -10,6 +9,7 @@ from pathlib import Path from typing import Generator, Literal +import requests from pydantic import BaseModel, HttpUrl from aurora import Batch @@ -217,25 +217,8 @@ def _mark(self, name: str) -> None: self._azcopy(["copy", str(mark_file_path), self._blob_path(f"{name}.finished")]) def _is_marked(self, name: str) -> bool: - out = json.loads( - self._azcopy( - [ - "list", - self._blob_path(f"{name}.finished"), - "--output-type", - "json", - "--output-level", - "essential", - ] - ) - ) - return ( - len(out) == 2 - and isinstance(out[0], dict) - and out[0].get("MessageType", None) == "ListObject" - and isinstance(out[1], dict) - and out[1].get("MessageType", None) == "EndOfJob" - ) + res = requests.head(self._blob_path(f"{name}.finished")) + return res.status_code == 200 def iterate_prediction_files(name: str, num_steps: int) -> Generator[str, None, None]: diff --git a/aurora/foundry/server/_hook.py b/aurora/foundry/server/_hook.py new file mode 100644 index 0000000..b8c2dff --- /dev/null +++ b/aurora/foundry/server/_hook.py @@ -0,0 +1,5 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license. + +The contents of this file can be replaced to modify the server. This is used to test the Docker +image. 
+""" diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index a90681c..012e180 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -10,6 +10,7 @@ from azureml_inference_server_http.api.aml_response import AMLResponse from pydantic import BaseModel, Field +import aurora.foundry.server._hook # noqa: F401 from aurora.foundry.common.channel import ( BlobStorageCommunication, LocalCommunication, diff --git a/tests/foundry/azcopy.py b/tests/foundry/azcopy.py index 1ea2368..0ce57cb 100644 --- a/tests/foundry/azcopy.py +++ b/tests/foundry/azcopy.py @@ -5,11 +5,19 @@ import json import logging +import os import shutil +import subprocess import sys from pathlib import Path -import click +try: + import click +except ImportError: + # This might be run in the release Docker image, where `click` won't be available. Just install + # it. + subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "click"]) + import click # Expose logging messages. logger = logging.getLogger() @@ -65,6 +73,9 @@ def main(work_path: Path, args: tuple[str, ...]) -> None: target = _parse_path(args[2], work_path) target.parent.mkdir(parents=True, exist_ok=True) shutil.copy(source, target) + # If this is run from within the release Docker image, we need to give others execution + # permissions to copy and load the file on the client side. 
+ os.chmod(target, 0o755) else: raise RuntimeError(f"Unknown command `{args[0]}`.") diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 6b964f0..0af5c0b 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -68,6 +68,7 @@ def _mock_send(request, context) -> dict: "subprocess-local", "subprocess-blob", "docker-local", + "docker-blob", ] ) def mock_foundry_client( @@ -111,21 +112,48 @@ def mock_foundry_client( blob_url_with_sas = ( "https://storageaccount.blob.core.windows.net/container/folder?SAS" ) + + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. + _, path = path.split("blob.core.windows.net/", 1) + + local_path = azcopy_mock_work_dir / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + requests_mock.add_matcher(_matcher) + yield { "client_comm": BlobStorageCommunication(blob_url_with_sas), "host_comm": BlobStorageCommunication(blob_url_with_sas), "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), } - elif request.param == "docker-local": + elif "docker" in request.param: requests_mock.real_http = True client_comm_folder = tmp_path / "communication" # It's important to create the communication folder on the client side already. If we don't, # Docker will create it, and the permissions will then be wrong. client_comm_folder.mkdir(exist_ok=True, parents=True) + azcopy_mock_work_dir.mkdir(exist_ok=True, parents=True) + + # Find the path of the server hook. + server_hook = Path(__file__).parents[0] / "docker_server_hook.py" # Run the Docker container. Assume that it has already been built. 
+ azcopy_path = Path(__file__).parents[0] / "azcopy.py" p = subprocess.Popen( [ "docker", @@ -136,6 +164,17 @@ def mock_foundry_client( "-t", "-v", f"{client_comm_folder}:/communication", + "-v", + f"{azcopy_mock_work_dir}:/azcopy_work", + "--mount", + f"type=bind,src={azcopy_path},dst=/aurora_foundry/azcopy.py,readonly", + "--mount", + ( + f"type=bind" + f",src={server_hook}" + f",dst=/aurora_foundry/aurora/foundry/server/_hook.py" + f",readonly" + ), "aurora-foundry:latest", ], ) @@ -155,11 +194,50 @@ def mock_foundry_client( raise e break - yield { - "client_comm": LocalCommunication(client_comm_folder), - "host_comm": LocalCommunication("/communication"), - "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), - } + if "local" in request.param: + yield { + "client_comm": LocalCommunication(client_comm_folder), + "host_comm": LocalCommunication("/communication"), + "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), + } + else: + # Communicate via blob storage, so mock `azcopy` too. + monkeypatch.setattr( + BlobStorageCommunication, + "_AZCOPY_EXECUTABLE", + ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ) + # The below test URL must start with `https`! + blob_url_with_sas = ( + "https://storageaccount.blob.core.windows.net/container/folder?SAS" + ) + + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. 
+ _, path = path.split("blob.core.windows.net/", 1) + + local_path = azcopy_mock_work_dir / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + requests_mock.add_matcher(_matcher) + + yield { + "client_comm": BlobStorageCommunication(blob_url_with_sas), + "host_comm": BlobStorageCommunication(blob_url_with_sas), + "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), + } finally: p.terminate() diff --git a/tests/foundry/docker_server_hook.py b/tests/foundry/docker_server_hook.py new file mode 100644 index 0000000..8c962a4 --- /dev/null +++ b/tests/foundry/docker_server_hook.py @@ -0,0 +1,49 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + +import subprocess +import sys +from pathlib import Path + +# This will be run in the release Docker image, so packages required for mocking are not available. +subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "requests_mock"]) + + +import requests # noqa: E402 +import requests_mock # noqa: E402 + +# First, mock requests that check for the existence of blobs. + + +def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. + _, path = path.split("blob.core.windows.net/", 1) + + # Assume that the local folder `/azcopy_work` is used by the mock of `azcopy`. 
+ local_path = Path("/azcopy_work") / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + +mock = requests_mock.Mocker().__enter__() +mock.real_http = True +mock.add_matcher(_matcher) + +from aurora.foundry.common.channel import BlobStorageCommunication # noqa: E402 + +# Second, mock `azcopy`, assuming that the `azcopy` mock working directory is `/azcopy_work`. +BlobStorageCommunication._AZCOPY_EXECUTABLE = [ + "python", + "/aurora_foundry/azcopy.py", + "/azcopy_work", +] diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index bc6a809..e7035cb 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -10,6 +10,8 @@ from pathlib import Path import click +import requests +import requests_mock from flask import Request from werkzeug.test import EnvironBuilder @@ -51,31 +53,54 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: str(azcopy_mock_work_path), ] - score.init() - - while True: - method = sys.stdin.readline().strip() - base_url = sys.stdin.readline().strip() - query_params = sys.stdin.readline().strip() - headers = json.loads(sys.stdin.readline().encode("utf-8").strip()) - payload = sys.stdin.readline().encode("utf-8").strip() - - builder = EnvironBuilder( - method=method, - base_url=base_url, - query_string=query_params, - headers=headers, - data=payload, - ) - env = builder.get_environ() - flask_request = Request(env) - - resp = score.run(flask_request) - answer = json.dumps(resp).encode("utf-8") if isinstance(resp, dict) else resp.data - - sys.stdout.write(answer.decode("utf-8")) - sys.stdout.write("\n") - sys.stdout.flush() + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. 
+ _, path = path.split("blob.core.windows.net/", 1) + + local_path = azcopy_mock_work_path / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + with requests_mock.Mocker() as mock: + mock.real_http = True + mock.add_matcher(_matcher) + + score.init() + + while True: + method = sys.stdin.readline().strip() + base_url = sys.stdin.readline().strip() + query_params = sys.stdin.readline().strip() + headers = json.loads(sys.stdin.readline().encode("utf-8").strip()) + payload = sys.stdin.readline().encode("utf-8").strip() + + builder = EnvironBuilder( + method=method, + base_url=base_url, + query_string=query_params, + headers=headers, + data=payload, + ) + env = builder.get_environ() + flask_request = Request(env) + + resp = score.run(flask_request) + answer = json.dumps(resp).encode("utf-8") if isinstance(resp, dict) else resp.data + + sys.stdout.write(answer.decode("utf-8")) + sys.stdout.write("\n") + sys.stdout.flush() if __name__ == "__main__": From 9930555943340500d1336985b1b7f3727e0b8816 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 23:03:17 +0100 Subject: [PATCH 22/44] simplify submission spec --- aurora/foundry/client/api.py | 2 +- aurora/foundry/common/channel.py | 14 +-- aurora/foundry/server/score.py | 70 ++++++------- tests/foundry/conftest.py | 166 ++++++++++++++----------------- 4 files changed, 111 insertions(+), 141 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index b3545b9..ad17f49 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -66,7 +66,7 @@ def submit( task = { "model_name": model_name, "num_steps": num_steps, - "host_comm": host_comm.to_spec(), + "data_folder_uri": host_comm.to_spec(), } response = foundry_client.submit_task(task) try: diff --git a/aurora/foundry/common/channel.py b/aurora/foundry/common/channel.py index 83625c4..e1d4063 
100644 --- a/aurora/foundry/common/channel.py +++ b/aurora/foundry/common/channel.py @@ -100,7 +100,7 @@ def _is_marked(self, name: str) -> bool: """ @abc.abstractmethod - def to_spec(self) -> dict[str, str]: + def to_spec(self) -> str: """Convert this channel to specification that can be serialised into JSON. Returns: @@ -119,11 +119,8 @@ def __init__(self, folder: str | Path) -> None: """ self.folder = Path(folder) - def to_spec(self) -> dict[str, str]: - return { - "class_name": "LocalCommunication", - "folder": str(self.folder), - } + def to_spec(self) -> str: + return str(self.folder) class Spec(BaseModel): class_name: Literal["LocalCommunication"] @@ -166,10 +163,7 @@ def __init__(self, blob_folder: str) -> None: raise ValueError("Given URL does not appear to contain a SAS token.") def to_spec(self) -> dict[str, str]: - return { - "class_name": "BlobStorageCommunication", - "blob_folder": self.blob_folder, - } + return self.blob_folder class Spec(BaseModel): class_name: Literal["BlobStorageCommunication"] diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 012e180..1a3c93c 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -23,18 +23,23 @@ # Need to give the name explicitly here, because the script may be run stand-alone. 
logger = logging.getLogger("aurora.foundry.server.score") -CommSpecs = Union[LocalCommunication.Spec, BlobStorageCommunication.Spec] - class Submission(BaseModel): - host_comm: CommSpecs = Field(..., discriminator="class_name") + data_folder_uri: str model_name: str num_steps: int -class Check(BaseModel): - action: Literal["check"] - uuid: str +class SubmissionResponse(BaseModel): + task_id: str + + +class ProgressInfo(BaseModel): + task_id: str + completed: bool + progress_percentage: int + error: bool + error_info: str POOL = ThreadPoolExecutor(max_workers=1) @@ -53,7 +58,7 @@ def __init__(self, request: Submission): def __call__(self) -> None: try: request = self.request - host_comm = request.host_comm.construct() + host_comm = BlobStorageCommunication(request.data_folder_uri) model_class = models[request.model_name] model = model_class() @@ -95,47 +100,34 @@ def run(input_data: AMLRequest) -> dict: logger.info("Received request.") if input_data.method == "POST": logger.info("Submitting new task to thread pool.") - try: - task = Task(Submission(**input_data.get_json())) - except Exception as exc: - return AMLResponse(dict(message=str(exc)), 500, {}, json_str=True) + task = Task(Submission(**input_data.get_json())) POOL.submit(task) TASKS[task.uuid] = task - return { - "task_id": task.uuid, - } + return SubmissionResponse(task_id=task.uuid).dict() elif input_data.method == "GET": logger.info("Returning the status of an existing task.") - uuid = input_data.args.get("task_id") - if not uuid: - return AMLResponse( - dict(message="Missing task_id query parameter."), - 400, - {}, - json_str=True, - ) - if uuid not in TASKS: - return { - "success": False, - "message": "Task UUID cannot be found.", - } + task_id = input_data.args.get("task_id") + if not task_id: + raise Exception("Missing task_id query parameter.") + if task_id not in TASKS: + raise Exception("Task UUID cannot be found.") else: - task = TASKS[uuid] + task = TASKS[task_id] # Allow the task some time to 
complete. # We sleep here so the client does not query too frequently. for _ in range(3): if task.completed: break time.sleep(1) - return { - "task_id": uuid, - "completed": task.completed, - "progress_percentage": task.progress_percentage, - "error": task.exc is not None, - "error_info": str(task.exc) if task.exc else "", - } - - else: - # This branch should be unreachable. - return AMLResponse(dict(message="Method not allowed."), 405, {}, json_str=True) + + return ProgressInfo( + task_id=task_id, + completed=task.completed, + progress_percentage=task.progress_percentage, + error=task.exc is not None, + error_info=str(task.exc) if task.exc else "", + ).dict() + + # This branch should be unreachable. + raise Exception("Method not allowed.") diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 0af5c0b..50a8191 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -92,53 +92,44 @@ def mock_foundry_client( # communication via blob storage, we must mock `azcopy` too. comm_folder = tmp_path / "communication" - if "local" in request.param: - # Communicate via a local folder. - yield { - "client_comm": LocalCommunication(comm_folder), - "host_comm": LocalCommunication(comm_folder), - "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), - } - - else: - # Communicate via blob storage, so mock `azcopy` too. - azcopy_path = Path(__file__).parents[0] / "azcopy.py" - monkeypatch.setattr( - BlobStorageCommunication, - "_AZCOPY_EXECUTABLE", - ["python", str(azcopy_path), str(azcopy_mock_work_dir)], - ) - # The below test URL must start with `https`! - blob_url_with_sas = ( - "https://storageaccount.blob.core.windows.net/container/folder?SAS" - ) - - def _matcher(request: requests.Request) -> requests.Response | None: - """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. - path, _ = request.url.split("?", 1) - # Split off the storage account URL. 
- _, path = path.split("blob.core.windows.net/", 1) - - local_path = azcopy_mock_work_dir / path - - response = requests.Response() - if local_path.exists(): - response.status_code = 200 - else: - response.status_code = 404 - return response - - return None - - requests_mock.add_matcher(_matcher) - - yield { - "client_comm": BlobStorageCommunication(blob_url_with_sas), - "host_comm": BlobStorageCommunication(blob_url_with_sas), - "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), - } + # Communicate via blob storage, so mock `azcopy` too. + azcopy_path = Path(__file__).parents[0] / "azcopy.py" + monkeypatch.setattr( + BlobStorageCommunication, + "_AZCOPY_EXECUTABLE", + ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ) + # The below test URL must start with `https`! + blob_url_with_sas = ( + "https://storageaccount.blob.core.windows.net/container/folder?SAS" + ) + + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. 
+ _, path = path.split("blob.core.windows.net/", 1) + + local_path = azcopy_mock_work_dir / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + requests_mock.add_matcher(_matcher) + + yield { + "client_comm": BlobStorageCommunication(blob_url_with_sas), + "host_comm": BlobStorageCommunication(blob_url_with_sas), + "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), + } elif "docker" in request.param: requests_mock.real_http = True @@ -194,50 +185,43 @@ def _matcher(request: requests.Request) -> requests.Response | None: raise e break - if "local" in request.param: - yield { - "client_comm": LocalCommunication(client_comm_folder), - "host_comm": LocalCommunication("/communication"), - "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), - } - else: - # Communicate via blob storage, so mock `azcopy` too. - monkeypatch.setattr( - BlobStorageCommunication, - "_AZCOPY_EXECUTABLE", - ["python", str(azcopy_path), str(azcopy_mock_work_dir)], - ) - # The below test URL must start with `https`! - blob_url_with_sas = ( - "https://storageaccount.blob.core.windows.net/container/folder?SAS" - ) - - def _matcher(request: requests.Request) -> requests.Response | None: - """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. - path, _ = request.url.split("?", 1) - # Split off the storage account URL. 
- _, path = path.split("blob.core.windows.net/", 1) - - local_path = azcopy_mock_work_dir / path - - response = requests.Response() - if local_path.exists(): - response.status_code = 200 - else: - response.status_code = 404 - return response - - return None - - requests_mock.add_matcher(_matcher) - - yield { - "client_comm": BlobStorageCommunication(blob_url_with_sas), - "host_comm": BlobStorageCommunication(blob_url_with_sas), - "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), - } + # Communicate via blob storage, so mock `azcopy` too. + monkeypatch.setattr( + BlobStorageCommunication, + "_AZCOPY_EXECUTABLE", + ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ) + # The below test URL must start with `https`! + blob_url_with_sas = ( + "https://storageaccount.blob.core.windows.net/container/folder?SAS" + ) + + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. 
+ _, path = path.split("blob.core.windows.net/", 1) + + local_path = azcopy_mock_work_dir / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + requests_mock.add_matcher(_matcher) + + yield { + "client_comm": BlobStorageCommunication(blob_url_with_sas), + "host_comm": BlobStorageCommunication(blob_url_with_sas), + "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), + } finally: p.terminate() From 916d13567b788fa4183a339be4e4fb0275b59f50 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Thu, 9 Jan 2025 23:05:43 +0100 Subject: [PATCH 23/44] linting --- aurora/foundry/common/channel.py | 2 +- aurora/foundry/server/score.py | 5 +---- tests/foundry/conftest.py | 10 +++------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/aurora/foundry/common/channel.py b/aurora/foundry/common/channel.py index e1d4063..0489579 100644 --- a/aurora/foundry/common/channel.py +++ b/aurora/foundry/common/channel.py @@ -162,7 +162,7 @@ def __init__(self, blob_folder: str) -> None: if "?" 
not in blob_folder: raise ValueError("Given URL does not appear to contain a SAS token.") - def to_spec(self) -> dict[str, str]: + def to_spec(self) -> str: return self.blob_folder class Spec(BaseModel): diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 1a3c93c..6a61a1e 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -3,17 +3,14 @@ import logging import time from concurrent.futures import ThreadPoolExecutor -from typing import Literal, Union from uuid import uuid4 from azureml_inference_server_http.api.aml_request import AMLRequest, rawhttp -from azureml_inference_server_http.api.aml_response import AMLResponse -from pydantic import BaseModel, Field +from pydantic import BaseModel import aurora.foundry.server._hook # noqa: F401 from aurora.foundry.common.channel import ( BlobStorageCommunication, - LocalCommunication, iterate_prediction_files, ) from aurora.foundry.common.model import models diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 50a8191..2e8072f 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -12,7 +12,7 @@ import requests from aurora.foundry.client.foundry import FoundryClient -from aurora.foundry.common.channel import BlobStorageCommunication, LocalCommunication +from aurora.foundry.common.channel import BlobStorageCommunication MOCK_ADDRESS = "https://mock-foundry.azurewebsites.net" @@ -100,9 +100,7 @@ def mock_foundry_client( ["python", str(azcopy_path), str(azcopy_mock_work_dir)], ) # The below test URL must start with `https`! 
- blob_url_with_sas = ( - "https://storageaccount.blob.core.windows.net/container/folder?SAS" - ) + blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" def _matcher(request: requests.Request) -> requests.Response | None: """Mock requests that check for the existence of blobs.""" @@ -192,9 +190,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: ["python", str(azcopy_path), str(azcopy_mock_work_dir)], ) # The below test URL must start with `https`! - blob_url_with_sas = ( - "https://storageaccount.blob.core.windows.net/container/folder?SAS" - ) + blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" def _matcher(request: requests.Request) -> requests.Response | None: """Mock requests that check for the existence of blobs.""" From acbf8db08600d15858d9c8a7861830185137f2ed Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Fri, 10 Jan 2025 00:17:37 +0100 Subject: [PATCH 24/44] auto-generate swagger-file --- Makefile | 6 +- aurora/foundry/server/generate-swagger.py | 82 +++++++ aurora/foundry/server/score.py | 23 ++ aurora/foundry/server/swagger3.json | 250 ++++++++++------------ tests/foundry/conftest.py | 4 - 5 files changed, 227 insertions(+), 138 deletions(-) create mode 100644 aurora/foundry/server/generate-swagger.py diff --git a/Makefile b/Makefile index 9aecbce..8a4c183 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: install test docs docker-requirements docker +.PHONY: install test docs docker-requirements docker swagger-file install: pip install --upgrade pip @@ -19,3 +19,7 @@ docker-requirements: pyproject.toml docker: (pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t aurora-foundry:latest . 
+ +swagger-file: + pip install fastapi + python aurora/foundry/server/generate-swagger.py aurora/foundry/server/swagger3.json diff --git a/aurora/foundry/server/generate-swagger.py b/aurora/foundry/server/generate-swagger.py new file mode 100644 index 0000000..94dccb5 --- /dev/null +++ b/aurora/foundry/server/generate-swagger.py @@ -0,0 +1,82 @@ +import json +import sys + +from fastapi import Depends, FastAPI, HTTPException, Query, status +from fastapi.security import APIKeyHeader +from score import ProgressInfo, Submission, SubmissionResponse + +app = FastAPI( + title="Aurora", + description="Evaluate Aurora Model", + version="1.0.0", +) +security = APIKeyHeader( + name="Authorization", auto_error=False, description="Example 'Bearer myApiKey'" +) + + +async def get_api_key(api_key: str = Depends(security)): + if api_key is None or not api_key.startswith("Bearer "): + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or missing API key", + headers={"WWW-Authenticate": "Bearer"}, + ) + token = api_key.split("Bearer ") + # Here you can add your logic to validate the token + if token != "your_actual_api_key": + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid API key", + headers={"WWW-Authenticate": "Bearer"}, + ) + return token + + +# POST method on /score endpoint +@app.post( + "/score", + response_model=SubmissionResponse, + summary="Submit a new task", +) +async def create_score(input: Submission, token: str = Depends(get_api_key)): ... + + +# GET method on /score endpoint +@app.get("/score", response_model=ProgressInfo, summary="Get progress info for a given task") +async def get_score( + task_id: str = Query( + ..., + description="The ID of the task", + examples=dict(task_id=dict(value="abc-123-def", summary="Sample Task ID")), + ), + token: str = Depends(get_api_key), +): ... 
+ + +# Liveness route +@app.get("/", summary="Succeeds when the service is ready.") +async def read_root(): + return "Healthy" + + +# Route to get the Swagger file +@app.get("/swagger.json") +async def get_swagger(): + return app.openapi() + + +def dump_openapi_spec(fn): + openapi_spec = app.openapi() + with open(fn, "w") as f: + json.dump(openapi_spec, f, indent=2) + print(f"OpenAPI spec dumped to {fn}") + + +if __name__ == "__main__": + if not sys.argv[1:]: + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) + else: + dump_openapi_spec(sys.argv[1]) diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 6a61a1e..d51c6ce 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -26,10 +26,22 @@ class Submission(BaseModel): model_name: str num_steps: int + class Config: + json_schema_extra = dict( + example=dict( + data_folder_uri="https://my.blob.core.windows.net/container/some/path?WRITABLE_SAS", + model_name="aurora-0.25-small-pretrained", + num_steps=5, + ) + ) + class SubmissionResponse(BaseModel): task_id: str + class Config: + json_schema_extra = dict(example=dict(task_id="abc-123-def")) + class ProgressInfo(BaseModel): task_id: str @@ -38,6 +50,17 @@ class ProgressInfo(BaseModel): error: bool error_info: str + class Config: + json_schema_extra = dict( + example=dict( + task_id="abc-123-def", + completed=True, + progress_percentage=100, + error=False, + error_info="", + ) + ) + POOL = ThreadPoolExecutor(max_workers=1) TASKS = dict() diff --git a/aurora/foundry/server/swagger3.json b/aurora/foundry/server/swagger3.json index e8db81a..8c399db 100644 --- a/aurora/foundry/server/swagger3.json +++ b/aurora/foundry/server/swagger3.json @@ -1,65 +1,29 @@ { "openapi": "3.1.0", "info": { - "title": "Aurora Endpoint", - "version": "0.1.0" + "title": "Aurora", + "description": "Evaluate Aurora Model", + "version": "1.0.0" }, - "security": [ - { - "Bearer": [] - } - ], "paths": { - "/": { - "get": { - 
"summary": "Check if alive", - "operationId": "ServiceHealthCheck", - "description": "Simple health check endpoint to ensure the service is up at any given point.", - "responses": { - "200": { - "description": "If service is up and running, this response will be returned with the content 'Healthy'", - "content": { - "text/plain": { - "schema": { - "type": "string", - "examples": [ - "Healthy" - ] - } - } - } - }, - "default": { - "description": "The service failed to execute due to an error.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - } - } - } - } - } - } - }, "/score": { "post": { + "summary": "Submit a new task", + "operationId": "create_score_score_post", "security": [ { - "Bearer": [] + "APIKeyHeader": [] } ], - "summary": "Create Task", - "operationId": "create_item_score__post", "requestBody": { + "required": true, "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateTask" + "$ref": "#/components/schemas/Submission" } } - }, - "required": true + } }, "responses": { "200": { @@ -67,7 +31,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/CreateTaskResponse" + "$ref": "#/components/schemas/SubmissionResponse" } } } @@ -85,86 +49,91 @@ } }, "get": { + "summary": "Get progress info for a given task", + "operationId": "get_score_score_get", "security": [ { - "Bearer": [] + "APIKeyHeader": [] } ], - "summary": "Fetch Progress of a Task", - "operationId": "get_item_score_post", "parameters": [ { "name": "task_id", "in": "query", "required": true, "schema": { - "type": "string" - } + "type": "string", + "description": "The ID of the task", + "examples": { + "task_id": { + "value": "abc-123-def", + "summary": "Sample Task ID" + } + }, + "title": "Task Id" + }, + "description": "The ID of the task" } ], "responses": { "200": { - "description": "Successfull Response", + "description": "Successful Response", "content": { "application/json": { 
"schema": { - "$ref": "#/components/schemas/Answer" + "$ref": "#/components/schemas/ProgressInfo" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" } } } } } } - } - }, - "components": { - "schemas": { - "CreateTaskResponse": { - "properties": { - "success": { - "type": "boolean", - "title": "Success" - }, - "message": { - "type": "string", - "title": "Message" - }, - "task_id": { - "type": "string", - "title": "Task ID" + }, + "/": { + "get": { + "summary": "Succeeds when the service is ready.", + "operationId": "read_root__get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } } } - }, - "ErrorResponse": { - "properties": { - "message": { - "type": "string" + } + }, + "/swagger.json": { + "get": { + "summary": "Get Swagger", + "operationId": "get_swagger_swagger_json_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } } } - }, - "Answer": { - "properties": { - "success": { - "type": "boolean", - "title": "Success" - }, - "message": { - "type": "string", - "title": "Message" - }, - "data": { - "$ref": "#/components/schemas/ProgressInfo", - "title": "Data" - } - }, - "type": "object", - "required": [ - "success", - "message", - "data" - ], - "title": "Answer" - }, + } + } + }, + "components": { + "schemas": { "HTTPValidationError": { "properties": { "detail": { @@ -180,13 +149,9 @@ }, "ProgressInfo": { "properties": { - "kind": { - "const": "progress_info", - "title": "Kind" - }, - "uuid": { + "task_id": { "type": "string", - "title": "Uuid" + "title": "Task Id" }, "completed": { "type": "boolean", @@ -206,10 +171,28 @@ } }, "type": "object", - "title": "ProgressInfo" + "required": [ + "task_id", + "completed", + "progress_percentage", + "error", + "error_info" + ], + "title": 
"ProgressInfo", + "example": { + "completed": true, + "error": false, + "error_info": "", + "progress_percentage": 100, + "task_id": "abc-123-def" + } }, - "CreateTask": { + "Submission": { "properties": { + "data_folder_uri": { + "type": "string", + "title": "Data Folder Uri" + }, "model_name": { "type": "string", "title": "Model Name" @@ -217,35 +200,36 @@ "num_steps": { "type": "integer", "title": "Num Steps" - }, - "client_comm": { - "type": "string", - "title": "Client Comm" - }, - "host_comm": { - "type": "string", - "title": "Host Comm" } }, "type": "object", "required": [ - "request" + "data_folder_uri", + "model_name", + "num_steps" ], - "title": "CreateTask" + "title": "Submission", + "example": { + "data_folder_uri": "https://my.blob.core.windows.net/container/some/path?WRITABLE_SAS", + "model_name": "aurora-0.25-small-pretrained", + "num_steps": 5 + } }, - "SubmissionInfo": { + "SubmissionResponse": { "properties": { - "kind": { - "const": "submission_info", - "title": "Kind" - }, - "uuid": { + "task_id": { "type": "string", - "title": "Uuid" + "title": "Task Id" } }, "type": "object", - "title": "SubmissionInfo" + "required": [ + "task_id" + ], + "title": "SubmissionResponse", + "example": { + "task_id": "abc-123-def" + } }, "ValidationError": { "properties": { @@ -279,14 +263,14 @@ "type" ], "title": "ValidationError" - }, - "securitySchemes": { - "Bearer": { - "type": "apiKey", - "name": "Authorization", - "in": "header", - "description": "For example Bearer abc123" - } + } + }, + "securitySchemes": { + "APIKeyHeader": { + "type": "apiKey", + "description": "Example 'Bearer myApiKey'", + "in": "header", + "name": "Authorization" } } } diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 2e8072f..95f6fb3 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -88,10 +88,6 @@ def mock_foundry_client( stdin, stdout, ), mock_foundry_responses_subprocess(stdin, stdout, requests_mock): - # Now we decide whether we do 
communication locally or via blob storage. If we do - # communication via blob storage, we must mock `azcopy` too. - comm_folder = tmp_path / "communication" - # Communicate via blob storage, so mock `azcopy` too. azcopy_path = Path(__file__).parents[0] / "azcopy.py" monkeypatch.setattr( From e3b670cfa0b467a4b02a2566d4707c6e8319fc48 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Fri, 10 Jan 2025 01:00:23 +0100 Subject: [PATCH 25/44] add make target to build docker image on ACR --- Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8a4c183..906003e 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,12 @@ docker-requirements: pyproject.toml docker: (pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm - AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t aurora-foundry:latest . + AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t testwsacr.azurecr.io/aurora-foundry:20250110-1 . + +docker-acr: + (pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm + [ ! -z "$(ACR)" ] + AURORA_REPO_VERSION=`python -m setuptools_scm` az acr build --build-arg AURORA_REPO_VERSION -r "$(ACR)" -t aurora-foundry:20250110-1 . 
swagger-file: pip install fastapi From 386dbcd80cd14e1c4c4481a97810c25c94152f0b Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 10:15:40 +0100 Subject: [PATCH 26/44] Simplify test and use right Docker image --- .github/workflows/ci.yaml | 4 +- Makefile | 9 +- aurora/foundry/server/score.py | 8 +- tests/foundry/conftest.py | 155 ++++++++++++--------------------- 4 files changed, 68 insertions(+), 108 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8385f71..d9f4fab 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,7 +28,7 @@ jobs: - name: Build Foundry image run: | - make docker + DOCKER_IMAGE=aurora-foundry make docker - name: Install dependencies run: | @@ -37,4 +37,4 @@ jobs: - name: Run tests run: | - pytest -v --cov=aurora --cov-report term-missing + DOCKER_IMAGE=aurora-foundry pytest -v --cov=aurora --cov-report term-missing diff --git a/Makefile b/Makefile index 906003e..6f3dde8 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,15 @@ .PHONY: install test docs docker-requirements docker swagger-file +DOCKER_WS ?= testwsacr +DOCKER_IMAGE ?= aurora-foundry:20250110-1 + install: pip install --upgrade pip pip install -e ".[dev]" pre-commit install test: - pytest tests -v --cov=aurora --cov-report=term --cov-report=html + DOCKER_IMAGE=$(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) pytest tests -v --cov=aurora --cov-report=term --cov-report=html docs: jupyter-book build docs @@ -18,12 +21,12 @@ docker-requirements: pyproject.toml docker: (pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm - AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t testwsacr.azurecr.io/aurora-foundry:20250110-1 . + AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t $(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) . docker-acr: (pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm [ ! 
-z "$(ACR)" ] - AURORA_REPO_VERSION=`python -m setuptools_scm` az acr build --build-arg AURORA_REPO_VERSION -r "$(ACR)" -t aurora-foundry:20250110-1 . + AURORA_REPO_VERSION=`python -m setuptools_scm` az acr build --build-arg AURORA_REPO_VERSION -r "$(ACR)" -t $(DOCKER_IMAGE) . swagger-file: pip install fastapi diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index d51c6ce..1406036 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -118,6 +118,7 @@ def run(input_data: AMLRequest) -> dict: dict/AMLResponse: The response to the request. dicts are implictitly 200 AMLResponses. """ logger.info("Received request.") + if input_data.method == "POST": logger.info("Submitting new task to thread pool.") task = Task(Submission(**input_data.get_json())) @@ -129,9 +130,9 @@ def run(input_data: AMLRequest) -> dict: logger.info("Returning the status of an existing task.") task_id = input_data.args.get("task_id") if not task_id: - raise Exception("Missing task_id query parameter.") + raise Exception("Missing `task_id` query parameter.") if task_id not in TASKS: - raise Exception("Task UUID cannot be found.") + raise Exception("Task ID cannot be found.") else: task = TASKS[task_id] # Allow the task some time to complete. @@ -149,5 +150,4 @@ def run(input_data: AMLRequest) -> dict: error_info=str(task.exc) if task.exc else "", ).dict() - # This branch should be unreachable. - raise Exception("Method not allowed.") + raise Exception("Method not allowed.") # This branch should be unreachable. diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 95f6fb3..8a22bb8 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -1,6 +1,7 @@ """Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" import json +import os import re import subprocess import time @@ -18,7 +19,7 @@ @contextmanager -def runner_process(azcopy_mock_work_dir): +def runner_process(azcopy_mock_work_dir: Path): score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" runner_path = Path(__file__).parents[0] / "runner.py" p = subprocess.Popen( @@ -63,82 +64,70 @@ def _mock_send(request, context) -> dict: yield -@pytest.fixture( - params=[ - "subprocess-local", - "subprocess-blob", - "docker-local", - "docker-blob", - ] -) +@pytest.fixture(params=["subprocess", "docker"]) def mock_foundry_client( request, monkeypatch, requests_mock, tmp_path: Path, ) -> Generator[dict, None, None]: + # Communicate via blob storage, so mock `azcopy` too. azcopy_mock_work_dir = tmp_path / "azcopy_work" - - if "subprocess" in request.param: - # Already determine a possible working path for the mock of `azcopy`. It might not be used, - # but we do already need to determine it. - - with runner_process(azcopy_mock_work_dir) as ( - p, - stdin, - stdout, - ), mock_foundry_responses_subprocess(stdin, stdout, requests_mock): - # Communicate via blob storage, so mock `azcopy` too. - azcopy_path = Path(__file__).parents[0] / "azcopy.py" - monkeypatch.setattr( - BlobStorageCommunication, - "_AZCOPY_EXECUTABLE", - ["python", str(azcopy_path), str(azcopy_mock_work_dir)], - ) - # The below test URL must start with `https`! - blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" - - def _matcher(request: requests.Request) -> requests.Response | None: - """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. - path, _ = request.url.split("?", 1) - # Split off the storage account URL. 
- _, path = path.split("blob.core.windows.net/", 1) - - local_path = azcopy_mock_work_dir / path - - response = requests.Response() - if local_path.exists(): - response.status_code = 200 - else: - response.status_code = 404 - return response - - return None - - requests_mock.add_matcher(_matcher) - - yield { - "client_comm": BlobStorageCommunication(blob_url_with_sas), - "host_comm": BlobStorageCommunication(blob_url_with_sas), - "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), - } - - elif "docker" in request.param: + # It's important to already create the work folder. If we don't then the Docker image will + # create it, and the permissions will then be wrong. + azcopy_mock_work_dir.mkdir(exist_ok=True, parents=True) + azcopy_path = Path(__file__).parents[0] / "azcopy.py" + monkeypatch.setattr( + BlobStorageCommunication, + "_AZCOPY_EXECUTABLE", + ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ) + # The below test URL must start with `https`! + blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" + + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + if "blob.core.windows.net/" in request.url: + # Split off the SAS token. + path, _ = request.url.split("?", 1) + # Split off the storage account URL. 
+ _, path = path.split("blob.core.windows.net/", 1) + + local_path = azcopy_mock_work_dir / path + + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + + return None + + requests_mock.add_matcher(_matcher) + + if request.param == "subprocess": + with runner_process(azcopy_mock_work_dir) as (p, stdin, stdout): # noqa: SIM117 + with mock_foundry_responses_subprocess(stdin, stdout, requests_mock): + yield { + "client_comm": BlobStorageCommunication(blob_url_with_sas), + "host_comm": BlobStorageCommunication(blob_url_with_sas), + "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), + } + + elif request.param == "docker": requests_mock.real_http = True - client_comm_folder = tmp_path / "communication" - # It's important to create the communication folder on the client side already. If we don't, - # Docker will create it, and the permissions will then be wrong. - client_comm_folder.mkdir(exist_ok=True, parents=True) - azcopy_mock_work_dir.mkdir(exist_ok=True, parents=True) + if "DOCKER_IMAGE" not in os.environ: + raise RuntimeError( + "Set the environment variable `DOCKER_IMAGE` " + "to the release image of Aurora Foundry." + ) + docker_image = os.environ["DOCKER_IMAGE"] - # Find the path of the server hook. + # Run the Docker container. Assume that it has already been built. Insert the hook + # to mock things on the server side. server_hook = Path(__file__).parents[0] / "docker_server_hook.py" - - # Run the Docker container. Assume that it has already been built. 
- azcopy_path = Path(__file__).parents[0] / "azcopy.py" p = subprocess.Popen( [ "docker", @@ -148,8 +137,6 @@ def _matcher(request: requests.Request) -> requests.Response | None: "--rm", "-t", "-v", - f"{client_comm_folder}:/communication", - "-v", f"{azcopy_mock_work_dir}:/azcopy_work", "--mount", f"type=bind,src={azcopy_path},dst=/aurora_foundry/azcopy.py,readonly", @@ -160,7 +147,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: f",dst=/aurora_foundry/aurora/foundry/server/_hook.py" f",readonly" ), - "aurora-foundry:latest", + docker_image, ], ) try: @@ -179,36 +166,6 @@ def _matcher(request: requests.Request) -> requests.Response | None: raise e break - # Communicate via blob storage, so mock `azcopy` too. - monkeypatch.setattr( - BlobStorageCommunication, - "_AZCOPY_EXECUTABLE", - ["python", str(azcopy_path), str(azcopy_mock_work_dir)], - ) - # The below test URL must start with `https`! - blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" - - def _matcher(request: requests.Request) -> requests.Response | None: - """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. - path, _ = request.url.split("?", 1) - # Split off the storage account URL. 
- _, path = path.split("blob.core.windows.net/", 1) - - local_path = azcopy_mock_work_dir / path - - response = requests.Response() - if local_path.exists(): - response.status_code = 200 - else: - response.status_code = 404 - return response - - return None - - requests_mock.add_matcher(_matcher) - yield { "client_comm": BlobStorageCommunication(blob_url_with_sas), "host_comm": BlobStorageCommunication(blob_url_with_sas), From 1060e4abd9ef5d39d9728c027f37a484ee9644fd Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 11:49:07 +0100 Subject: [PATCH 27/44] Rework interaction between client and host --- Makefile | 2 +- aurora/foundry/client/api.py | 57 +++++--- aurora/foundry/common/channel.py | 126 ++++++++++++------ ...enerate-swagger.py => generate_swagger.py} | 12 +- aurora/foundry/server/score.py | 118 ++++++++++------ aurora/foundry/server/swagger3.json | 111 ++++++++------- 6 files changed, 268 insertions(+), 158 deletions(-) rename aurora/foundry/server/{generate-swagger.py => generate_swagger.py} (85%) diff --git a/Makefile b/Makefile index 6f3dde8..cffd08d 100644 --- a/Makefile +++ b/Makefile @@ -30,4 +30,4 @@ docker-acr: swagger-file: pip install fastapi - python aurora/foundry/server/generate-swagger.py aurora/foundry/server/swagger3.json + python aurora/foundry/server/generate_swagger.py aurora/foundry/server/swagger3.json diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index ad17f49..5cb7225 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -18,16 +18,17 @@ logger = logging.getLogger(__name__) -class SubmissionInfo(BaseModel): +class CreationInfo(BaseModel): task_id: str -class ProgressInfo(BaseModel): +class TaskInfo(BaseModel): task_id: str completed: bool progress_percentage: int - error: bool - error_info: str + success: bool | None + submitted: bool + status: str class SubmissionError(Exception): @@ -62,7 +63,7 @@ def submit( if model_name not in models: raise KeyError(f"Model 
`{model_name}` is not a valid model.") - # Send a request to the endpoint to produce the predictions. + # Create a task at the endpoint. task = { "model_name": model_name, "num_steps": num_steps, @@ -70,32 +71,46 @@ def submit( } response = foundry_client.submit_task(task) try: - submission_info = SubmissionInfo(**response) + submission_info = CreationInfo(**response) except Exception as e: - raise SubmissionError(response["message"]) from e + raise SubmissionError("Failed to create task.") from e task_id = submission_info.task_id - logger.info("Submitted task %r to endpoint.", task_id) + logger.info(f"Created task `{task_id}` at endpoint.") # Send the initial condition over. client_comm.send(batch, task_id, "input.nc") + previous_status: str = "No Status" previous_progress: int = 0 while True: - # Check on the progress of the task. + # Check on the progress of the task. The first progress check will trigger the task to be + # submitted. response = foundry_client.get_progress(task_id) - progress_info = ProgressInfo(**response) - - if progress_info.error: - raise SubmissionError(f"Task failed: {progress_info.error_info}") - - if progress_info.progress_percentage > previous_progress: - logger.info(f"Task progress update: {progress_info.progress_percentage}%.") - previous_progress = progress_info.progress_percentage - - if progress_info.completed: - logger.info("Task has been completed!") - break + task_info = TaskInfo(**response) + + if task_info.submitted: + # If the task has been submitted, we must be able to read the acknowledgement of the + # initial condition. 
+ try: + client_comm.read(task_id, "input.nc.ack", timeout=120) + except TimeoutError as e: + raise SubmissionError("Could not read acknowledgement of initial condition.") from e + + if task_info.status != previous_status: + logger.info(f"Task status update: {task_info.status}") + previous_status = task_info.status + + if task_info.progress_percentage > previous_progress: + logger.info(f"Task progress update: {task_info.progress_percentage}%.") + previous_progress = task_info.progress_percentage + + if task_info.completed: + if task_info.success: + logger.info("Task has been successfully completed!") + break + else: + raise SubmissionError(f"Task failed: {task_info.status}") logger.info("Retrieving predictions.") for prediction_name in iterate_prediction_files("prediction.nc", num_steps): diff --git a/aurora/foundry/common/channel.py b/aurora/foundry/common/channel.py index 0489579..d1bfd5b 100644 --- a/aurora/foundry/common/channel.py +++ b/aurora/foundry/common/channel.py @@ -16,7 +16,6 @@ __all__ = [ "CommunicationChannel", - "LocalCommunication", "BlobStorageCommunication", "iterate_prediction_files", ] @@ -39,7 +38,7 @@ def send(self, batch: Batch, uuid: str, name: str) -> None: self._send(batch, name) self._mark(name) - def receive(self, uuid: str, name: str) -> Batch: + def receive(self, uuid: str, name: str, timeout: int = 120) -> Batch: """Receive the batch under the file name `name`. This function blocks until the file is ready. @@ -47,15 +46,65 @@ def receive(self, uuid: str, name: str) -> Batch: Args: uuid (str): UUID of the task. name (str): Name to receive. + timeout (int, optional): Timeout in seconds. Defaults to 2 minutes. Returns: :class:`aurora.Batch`: Batch under the name `name`. 
""" name = f"{uuid}/{name}" + start = time.time() while not self._is_marked(name): - time.sleep(0.5) + if time.time() - start < timeout: + time.sleep(1) + else: + raise TimeoutError("File was not marked within the timeout.") return self._receive(name) + def write(self, data: bytes, uuid: str, name: str) -> None: + """Write `data` to `name`. + + Args: + Data (bytes): Data to write. + uuid (str): UUID of the task. + name (str): Name to write to. + """ + name = f"{uuid}/{name}" + self._write(data, name) + self._mark(name) + + def read(self, uuid: str, name: str, timeout: int = 120) -> bytes: + """Read `name`. + + Args: + uuid (str): UUID of the task. + name (str): Name to read. + timeout (int, optional): Timeout in seconds. Defaults to 2 minutes. + + Returns: + bytes: Data of `name`. + """ + name = f"{uuid}/{name}" + start = time.time() + while not self._is_marked(name): + if time.time() - start < timeout: + time.sleep(1) + else: + raise TimeoutError("File was not marked within the timeout.") + return self._read(name) + + def exists(self, uuid: str, name: str) -> bool: + """Check whether `name` is available. + + Args: + uuid (str): UUID of the task. + name (str): Name to check for. + + Returns: + bool: Wether `name` is available. + """ + name = f"{uuid}/{name}" + return self._is_marked(name) + @abc.abstractmethod def _send(self, batch: Batch, name: str) -> None: """Send `batch` under the file name `name` without marking the file. @@ -80,6 +129,27 @@ def _receive(self, name: str) -> Batch: :class:`aurora.Batch`: Batch under the file name `name`. """ + @abc.abstractmethod + def _write(self, data: bytes, name: str) -> None: + """Send the data `data` under the file name `name` without marking the file. + + This method should be implemented. + + Args: + data (bytes): Data to send. + name (str): Name of `data`. + """ + + @abc.abstractmethod + def _read(self, name: str) -> bytes: + """Read data from `name`. 
+ + This function asserts that the file is ready and should be implemented by implementations. + + Args: + name (str): Name to read. + """ + @abc.abstractmethod def _mark(self, name: str) -> None: """Mark the file `name` as done. @@ -108,44 +178,6 @@ def to_spec(self) -> str: """ -class LocalCommunication(CommunicationChannel): - """A communication channel via a local folder.""" - - def __init__(self, folder: str | Path) -> None: - """Instantiate. - - Args: - folder (str or Path): Folder to use. - """ - self.folder = Path(folder) - - def to_spec(self) -> str: - return str(self.folder) - - class Spec(BaseModel): - class_name: Literal["LocalCommunication"] - folder: Path - - def construct(self) -> "LocalCommunication": - return LocalCommunication(folder=str(self.folder)) - - def _send(self, batch: Batch, name: str) -> None: - target = self.folder / name - target.parent.mkdir(exist_ok=True, parents=True) - batch.to_netcdf(target) - - def _receive(self, name: str) -> Batch: - return Batch.from_netcdf(self.folder / name) - - def _mark(self, name: str) -> None: - target = self.folder / f"{name}.finished" - target.parent.mkdir(exist_ok=True, parents=True) - target.touch() - - def _is_marked(self, name: str) -> bool: - return (self.folder / f"{name}.finished").exists() - - class BlobStorageCommunication(CommunicationChannel): """A communication channel via a folder in a Azure Blob Storage container.""" @@ -203,6 +235,18 @@ def _receive(self, name: str) -> Batch: self._azcopy(["copy", self._blob_path(name), tf.name]) return Batch.from_netcdf(tf.name) + def _write(self, data: bytes, name: str) -> None: + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "wb") as f: + f.write(data) + self._azcopy(["copy", tf.name, self._blob_path(name)]) + + def _read(self, name: str) -> bytes: + with tempfile.NamedTemporaryFile() as tf: + self._azcopy(["copy", self._blob_path(name), tf.name]) + with open(tf.name, "rb") as f: + return f.read() + def _mark(self, name: str) -> 
None: with tempfile.TemporaryDirectory() as td: mark_file_path = Path(td) / f"{name}.finished" diff --git a/aurora/foundry/server/generate-swagger.py b/aurora/foundry/server/generate_swagger.py similarity index 85% rename from aurora/foundry/server/generate-swagger.py rename to aurora/foundry/server/generate_swagger.py index 94dccb5..15e3462 100644 --- a/aurora/foundry/server/generate-swagger.py +++ b/aurora/foundry/server/generate_swagger.py @@ -1,13 +1,15 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + import json import sys from fastapi import Depends, FastAPI, HTTPException, Query, status from fastapi.security import APIKeyHeader -from score import ProgressInfo, Submission, SubmissionResponse +from score import CreationResponse, Submission, TaskInfo app = FastAPI( title="Aurora", - description="Evaluate Aurora Model", + description="Produce predictions with the Aurora model", version="1.0.0", ) security = APIKeyHeader( @@ -36,14 +38,14 @@ async def get_api_key(api_key: str = Depends(security)): # POST method on /score endpoint @app.post( "/score", - response_model=SubmissionResponse, - summary="Submit a new task", + response_model=CreationResponse, + summary="Create a new task", ) async def create_score(input: Submission, token: str = Depends(get_api_key)): ... 
# GET method on /score endpoint -@app.get("/score", response_model=ProgressInfo, summary="Get progress info for a given task") +@app.get("/score", response_model=TaskInfo, summary="Update an existing task") async def get_score( task_id: str = Query( ..., diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 1406036..3ad2f4f 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -6,7 +6,7 @@ from uuid import uuid4 from azureml_inference_server_http.api.aml_request import AMLRequest, rawhttp -from pydantic import BaseModel +from pydantic import BaseModel, HttpUrl import aurora.foundry.server._hook # noqa: F401 from aurora.foundry.common.channel import ( @@ -22,7 +22,7 @@ class Submission(BaseModel): - data_folder_uri: str + data_folder_uri: HttpUrl model_name: str num_steps: int @@ -36,19 +36,20 @@ class Config: ) -class SubmissionResponse(BaseModel): +class CreationResponse(BaseModel): task_id: str class Config: json_schema_extra = dict(example=dict(task_id="abc-123-def")) -class ProgressInfo(BaseModel): +class TaskInfo(BaseModel): task_id: str completed: bool progress_percentage: int - error: bool - error_info: str + success: bool | None + submitted: bool + status: str class Config: json_schema_extra = dict( @@ -56,8 +57,9 @@ class Config: task_id="abc-123-def", completed=True, progress_percentage=100, - error=False, - error_info="", + success=None, + submitted=True, + error_info="Queued", ) ) @@ -67,43 +69,56 @@ class Config: class Task: - def __init__(self, request: Submission): - self.request: Submission = request - # TODO: Make sure that this `uuid` really is unique! - self.uuid: str = str(uuid4()) - self.progress_percentage: int = 0 - self.completed: bool = False - self.exc: Exception | None = None + def __init__(self, submission: Submission): + self.submission: Submission = submission + + self.task_info = TaskInfo( + # TODO: Make sure that this `uuid` really is unique! 
+ task_id=str(uuid4()), + completed=False, + progress_percentage=0, + success=None, + submitted=False, + status="Unsubmitted", + ) def __call__(self) -> None: + self.task_info.status = "Running" + try: - request = self.request - host_comm = BlobStorageCommunication(request.data_folder_uri) + submission = self.submission + host_comm = BlobStorageCommunication(str(submission.data_folder_uri)) - model_class = models[request.model_name] + model_class = models[submission.model_name] model = model_class() - batch = host_comm.receive(self.uuid, "input.nc") + batch = host_comm.receive(self.task_info.task_id, "input.nc") logger.info("Running predictions.") for i, (pred, path) in enumerate( zip( - model.run(batch, request.num_steps), - iterate_prediction_files("prediction.nc", request.num_steps), + model.run(batch, submission.num_steps), + iterate_prediction_files("prediction.nc", submission.num_steps), ) ): - host_comm.send(pred, self.uuid, path) - self.progress_percentage = int((100 * (i + 1)) / request.num_steps) + host_comm.send(pred, self.task_info.task_id, path) - self.completed = True + self.task_info.progress_percentage = int((100 * (i + 1)) / submission.num_steps) + + self.task_info.success = True + self.task_info.status = "Successfully completed" except Exception as exc: - self.exc = exc + self.task_info.success = False + self.task_info.status = f"Exception: {str(exc)}" + + finally: + self.task_info.completed = True def init() -> None: """Initialise. 
Do not load the model here, because which model we need depends on the - request.""" + submission.""" POOL.__enter__() @@ -120,34 +135,53 @@ def run(input_data: AMLRequest) -> dict: logger.info("Received request.") if input_data.method == "POST": - logger.info("Submitting new task to thread pool.") + logger.info("Creating a new task.") task = Task(Submission(**input_data.get_json())) - POOL.submit(task) - TASKS[task.uuid] = task - return SubmissionResponse(task_id=task.uuid).dict() + TASKS[task.task_info.task_id] = task + return CreationResponse(task_id=task.task_info.task_id).dict() elif input_data.method == "GET": - logger.info("Returning the status of an existing task.") + logger.info("Processing an existing task.") + task_id = input_data.args.get("task_id") if not task_id: raise Exception("Missing `task_id` query parameter.") if task_id not in TASKS: raise Exception("Task ID cannot be found.") + + task = TASKS[task_id] + + if not task.task_info.submitted: + # Attempt to submit the task if the initial condition is available. + + comm = BlobStorageCommunication(str(task.submission.data_folder_uri)) + if comm.exists(task_id, "input.nc"): + logger.info("Initial condition was found. Submitting task.") + # Send an acknowledgement back to test that the host can write. The client will + # check for this acknowledgement. + comm.write(b"", task_id, "input.nc.ack") + + # Queue the task. + task.task_info.submitted = True + task.task_info.status = "Queued" + POOL.submit(task) + + else: + logger.info("Initial condition not available. Waiting.") + + # Wait a little to prevent the client for querying too frequently. + time.sleep(3) + else: - task = TASKS[task_id] - # Allow the task some time to complete. - # We sleep here so the client does not query too frequently. + logger.info("Task still running. Waiting.") + + # Wait a little to prevent the client for querying too frequently. While waiting, + # do check for the task to be completed. 
for _ in range(3): - if task.completed: + if task.task_info.completed: break time.sleep(1) - return ProgressInfo( - task_id=task_id, - completed=task.completed, - progress_percentage=task.progress_percentage, - error=task.exc is not None, - error_info=str(task.exc) if task.exc else "", - ).dict() + return task.task_info.dict() raise Exception("Method not allowed.") # This branch should be unreachable. diff --git a/aurora/foundry/server/swagger3.json b/aurora/foundry/server/swagger3.json index 8c399db..f66470f 100644 --- a/aurora/foundry/server/swagger3.json +++ b/aurora/foundry/server/swagger3.json @@ -2,13 +2,13 @@ "openapi": "3.1.0", "info": { "title": "Aurora", - "description": "Evaluate Aurora Model", + "description": "Produce predictions with the Aurora model", "version": "1.0.0" }, "paths": { "/score": { "post": { - "summary": "Submit a new task", + "summary": "Create a new task", "operationId": "create_score_score_post", "security": [ { @@ -31,7 +31,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/SubmissionResponse" + "$ref": "#/components/schemas/CreationResponse" } } } @@ -49,7 +49,7 @@ } }, "get": { - "summary": "Get progress info for a given task", + "summary": "Update an existing task", "operationId": "get_score_score_get", "security": [ { @@ -81,7 +81,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ProgressInfo" + "$ref": "#/components/schemas/TaskInfo" } } } @@ -134,63 +134,42 @@ }, "components": { "schemas": { - "HTTPValidationError": { - "properties": { - "detail": { - "items": { - "$ref": "#/components/schemas/ValidationError" - }, - "type": "array", - "title": "Detail" - } - }, - "type": "object", - "title": "HTTPValidationError" - }, - "ProgressInfo": { + "CreationResponse": { "properties": { "task_id": { "type": "string", "title": "Task Id" - }, - "completed": { - "type": "boolean", - "title": "Completed" - }, - "progress_percentage": { - "type": "integer", - "title": 
"Progress Percentage" - }, - "error": { - "type": "boolean", - "title": "Error" - }, - "error_info": { - "type": "string", - "title": "Error Info" } }, "type": "object", "required": [ - "task_id", - "completed", - "progress_percentage", - "error", - "error_info" + "task_id" ], - "title": "ProgressInfo", + "title": "CreationResponse", "example": { - "completed": true, - "error": false, - "error_info": "", - "progress_percentage": 100, "task_id": "abc-123-def" } }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { + "$ref": "#/components/schemas/ValidationError" + }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, "Submission": { "properties": { "data_folder_uri": { "type": "string", + "maxLength": 2083, + "minLength": 1, + "format": "uri", "title": "Data Folder Uri" }, "model_name": { @@ -215,19 +194,55 @@ "num_steps": 5 } }, - "SubmissionResponse": { + "TaskInfo": { "properties": { "task_id": { "type": "string", "title": "Task Id" + }, + "completed": { + "type": "boolean", + "title": "Completed" + }, + "progress_percentage": { + "type": "integer", + "title": "Progress Percentage" + }, + "success": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Success" + }, + "submitted": { + "type": "boolean", + "title": "Submitted" + }, + "status": { + "type": "string", + "title": "Status" } }, "type": "object", "required": [ - "task_id" + "task_id", + "completed", + "progress_percentage", + "success", + "submitted", + "status" ], - "title": "SubmissionResponse", + "title": "TaskInfo", "example": { + "completed": true, + "error_info": "Queued", + "progress_percentage": 100, + "submitted": true, "task_id": "abc-123-def" } }, From 61a543ff32fef3b8820fadd43e59bbe1bdb15ab3 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 11:59:53 +0100 Subject: [PATCH 28/44] Improve URL parsing --- tests/foundry/conftest.py | 9 ++++----- 
tests/foundry/docker_server_hook.py | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 8a22bb8..a338db9 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -8,6 +8,7 @@ from contextlib import contextmanager from pathlib import Path from typing import Generator +from urllib.parse import urlparse import pytest import requests @@ -87,12 +88,10 @@ def mock_foundry_client( def _matcher(request: requests.Request) -> requests.Response | None: """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. - path, _ = request.url.split("?", 1) - # Split off the storage account URL. - _, path = path.split("blob.core.windows.net/", 1) + url = urlparse(request.url) + path = url.path[1:] # Remove leading slash. + if url.hostname and url.hostname.endswith("blob.core.windows.net"): local_path = azcopy_mock_work_dir / path response = requests.Response() diff --git a/tests/foundry/docker_server_hook.py b/tests/foundry/docker_server_hook.py index 8c962a4..9ce36ee 100644 --- a/tests/foundry/docker_server_hook.py +++ b/tests/foundry/docker_server_hook.py @@ -3,6 +3,7 @@ import subprocess import sys from pathlib import Path +from urllib.parse import urlparse # This will be run in the release Docker image, so packages required for mocking are not available. subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "requests_mock"]) @@ -16,12 +17,10 @@ def _matcher(request: requests.Request) -> requests.Response | None: """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. - path, _ = request.url.split("?", 1) - # Split off the storage account URL. - _, path = path.split("blob.core.windows.net/", 1) + url = urlparse(request.url) + path = url.path[1:] # Remove leading slash. 
+ if url.hostname and url.hostname.endswith("blob.core.windows.net"): # Assume that the local folder `/azcopy_work` is used by the mock of `azcopy`. local_path = Path("/azcopy_work") / path From 81f46e17546df4e6bab40a20aeeb6e151172b4e5 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 12:09:19 +0100 Subject: [PATCH 29/44] Only require one communication channel --- aurora/foundry/__init__.py | 4 ++-- aurora/foundry/client/api.py | 17 +++++++---------- aurora/foundry/common/channel.py | 10 +++++----- aurora/foundry/server/score.py | 14 +++++++------- docs/foundry/api.rst | 2 +- docs/foundry/submission.md | 19 ++++++++----------- tests/foundry/conftest.py | 10 ++++------ tests/foundry/docker_server_hook.py | 4 ++-- tests/foundry/runner.py | 2 +- 9 files changed, 37 insertions(+), 45 deletions(-) diff --git a/aurora/foundry/__init__.py b/aurora/foundry/__init__.py index 5f5f373..8543a8b 100644 --- a/aurora/foundry/__init__.py +++ b/aurora/foundry/__init__.py @@ -2,10 +2,10 @@ from aurora.foundry.client.api import SubmissionError, submit from aurora.foundry.client.foundry import FoundryClient -from aurora.foundry.common.channel import BlobStorageCommunication +from aurora.foundry.common.channel import BlobStorageChannel __all__ = [ - "BlobStorageCommunication", + "BlobStorageChannel", "FoundryClient", "submit", "SubmissionError", diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index 5cb7225..0e33d24 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -39,8 +39,7 @@ def submit( batch: Batch, model_name: str, num_steps: int, - client_comm: CommunicationChannel, - host_comm: CommunicationChannel, + channel: CommunicationChannel, foundry_client: AbstractFoundryClient, ) -> Generator[Batch, None, None]: """Submit a request to Azure AI Foundry and retrieve the predictions. @@ -50,10 +49,8 @@ def submit( model_name (str): Name of the model. 
This name must be available in :mod:`aurora_foundry.common.model`. num_steps (int): Number of prediction steps. - client_comm (:class:`aurora_foundry.common.comm.CommunicationChannel`): Channel that the - client uses to send and receive data. - host_comm (:class:`aurora_foundry.common.comm.CommunicationChannel`): Channel that the host - uses to send and receive data. + channel (:class:`aurora_foundry.common.channel.CommunicationChannel`): Channel to use for + sending and receiving data. foundry_client (:class:`aurora_foundry.client.foundry.AbstractFoundryClient`): Client to communicate with Azure Foundry AI. @@ -67,7 +64,7 @@ def submit( task = { "model_name": model_name, "num_steps": num_steps, - "data_folder_uri": host_comm.to_spec(), + "data_folder_uri": channel.to_spec(), } response = foundry_client.submit_task(task) try: @@ -78,7 +75,7 @@ def submit( logger.info(f"Created task `{task_id}` at endpoint.") # Send the initial condition over. - client_comm.send(batch, task_id, "input.nc") + channel.send(batch, task_id, "input.nc") previous_status: str = "No Status" previous_progress: int = 0 @@ -93,7 +90,7 @@ def submit( # If the task has been submitted, we must be able to read the acknowledgement of the # initial condition. 
try: - client_comm.read(task_id, "input.nc.ack", timeout=120) + channel.read(task_id, "input.nc.ack", timeout=120) except TimeoutError as e: raise SubmissionError("Could not read acknowledgement of initial condition.") from e @@ -114,4 +111,4 @@ def submit( logger.info("Retrieving predictions.") for prediction_name in iterate_prediction_files("prediction.nc", num_steps): - yield client_comm.receive(task_id, prediction_name) + yield channel.receive(task_id, prediction_name) diff --git a/aurora/foundry/common/channel.py b/aurora/foundry/common/channel.py index d1bfd5b..823903e 100644 --- a/aurora/foundry/common/channel.py +++ b/aurora/foundry/common/channel.py @@ -16,7 +16,7 @@ __all__ = [ "CommunicationChannel", - "BlobStorageCommunication", + "BlobStorageChannel", "iterate_prediction_files", ] @@ -178,7 +178,7 @@ def to_spec(self) -> str: """ -class BlobStorageCommunication(CommunicationChannel): +class BlobStorageChannel(CommunicationChannel): """A communication channel via a folder in a Azure Blob Storage container.""" _AZCOPY_EXECUTABLE: list[str] = ["azcopy"] @@ -198,11 +198,11 @@ def to_spec(self) -> str: return self.blob_folder class Spec(BaseModel): - class_name: Literal["BlobStorageCommunication"] + class_name: Literal["BlobStorageChannel"] blob_folder: HttpUrl # TODO: Can we force this to be `https`? - def construct(self) -> "BlobStorageCommunication": - return BlobStorageCommunication(blob_folder=str(self.blob_folder)) + def construct(self) -> "BlobStorageChannel": + return BlobStorageChannel(blob_folder=str(self.blob_folder)) def _blob_path(self, name: str) -> str: """For a given file name `name`, get the full path including the SAS token. 
diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index 3ad2f4f..af47d91 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -10,7 +10,7 @@ import aurora.foundry.server._hook # noqa: F401 from aurora.foundry.common.channel import ( - BlobStorageCommunication, + BlobStorageChannel, iterate_prediction_files, ) from aurora.foundry.common.model import models @@ -87,12 +87,12 @@ def __call__(self) -> None: try: submission = self.submission - host_comm = BlobStorageCommunication(str(submission.data_folder_uri)) + channel = BlobStorageChannel(str(submission.data_folder_uri)) model_class = models[submission.model_name] model = model_class() - batch = host_comm.receive(self.task_info.task_id, "input.nc") + batch = channel.receive(self.task_info.task_id, "input.nc") logger.info("Running predictions.") for i, (pred, path) in enumerate( @@ -101,7 +101,7 @@ def __call__(self) -> None: iterate_prediction_files("prediction.nc", submission.num_steps), ) ): - host_comm.send(pred, self.task_info.task_id, path) + channel.send(pred, self.task_info.task_id, path) self.task_info.progress_percentage = int((100 * (i + 1)) / submission.num_steps) @@ -154,12 +154,12 @@ def run(input_data: AMLRequest) -> dict: if not task.task_info.submitted: # Attempt to submit the task if the initial condition is available. - comm = BlobStorageCommunication(str(task.submission.data_folder_uri)) - if comm.exists(task_id, "input.nc"): + channel = BlobStorageChannel(str(task.submission.data_folder_uri)) + if channel.exists(task_id, "input.nc"): logger.info("Initial condition was found. Submitting task.") # Send an acknowledgement back to test that the host can write. The client will # check for this acknowledgement. - comm.write(b"", task_id, "input.nc.ack") + channel.write(b"", task_id, "input.nc.ack") # Queue the task. 
task.task_info.submitted = True diff --git a/docs/foundry/api.rst b/docs/foundry/api.rst index 1138439..eeeae0a 100644 --- a/docs/foundry/api.rst +++ b/docs/foundry/api.rst @@ -8,7 +8,7 @@ Submission .. autoclass:: aurora.foundry.client.foundry.FoundryClient :members: __init__ -.. autoclass:: aurora.foundry.common.channel.BlobStorageCommunication +.. autoclass:: aurora.foundry.common.channel.BlobStorageChannel :members: __init__ diff --git a/docs/foundry/submission.md b/docs/foundry/submission.md index a697314..1763850 100644 --- a/docs/foundry/submission.md +++ b/docs/foundry/submission.md @@ -1,6 +1,6 @@ # Submitting Predictions -To produce predictions on Azure AI Foundry, the client will communicate through +To produce predictions on Azure AI Foundry, the client will communicate with the host through a blob storage container, so `azcopy` needs to be available in the local path. [See here for instructions on how to install `azcopy`.](https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10) @@ -15,21 +15,19 @@ foundry_client = FoundryClient( ) ``` -Then set up a way to communicate with the model running on Foundry. -You likely want to send data back and forth via a folder in a blob storage container: +Then set up a blob storage container for communication with the host: ```python -from aurora.foundry import BlobStorageCommunication +from aurora.foundry import BlobStorageChannel -communication = BlobStorageCommunication( +channel = BlobStorageChannel( "https://my.blob.core.windows.net/container/folder?" ) ``` -The SAS token needs read, write, and list rights. -This API does not automatically delete the model initial condition and predictions that are -uploaded to the blob storage folder. -You will need to do that yourself. +The SAS token needs read and write rights. +This blob storage container will be used to send the initial condition to the host and to retrieve +the predictions from the host. 
You can now submit requests in the following way: @@ -58,9 +56,8 @@ for pred in submit( batch=initial_condition, model_name="aurora-0.25-small-pretrained", num_steps=4, - client=communication, - host=communication, foundry_client=foundry_client, + channel=channel, ): pass # Do something with `pred`. ``` diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index a338db9..bcafa6b 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -14,7 +14,7 @@ import requests from aurora.foundry.client.foundry import FoundryClient -from aurora.foundry.common.channel import BlobStorageCommunication +from aurora.foundry.common.channel import BlobStorageChannel MOCK_ADDRESS = "https://mock-foundry.azurewebsites.net" @@ -79,7 +79,7 @@ def mock_foundry_client( azcopy_mock_work_dir.mkdir(exist_ok=True, parents=True) azcopy_path = Path(__file__).parents[0] / "azcopy.py" monkeypatch.setattr( - BlobStorageCommunication, + BlobStorageChannel, "_AZCOPY_EXECUTABLE", ["python", str(azcopy_path), str(azcopy_mock_work_dir)], ) @@ -109,8 +109,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: with runner_process(azcopy_mock_work_dir) as (p, stdin, stdout): # noqa: SIM117 with mock_foundry_responses_subprocess(stdin, stdout, requests_mock): yield { - "client_comm": BlobStorageCommunication(blob_url_with_sas), - "host_comm": BlobStorageCommunication(blob_url_with_sas), + "channel": BlobStorageChannel(blob_url_with_sas), "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), } @@ -166,8 +165,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: break yield { - "client_comm": BlobStorageCommunication(blob_url_with_sas), - "host_comm": BlobStorageCommunication(blob_url_with_sas), + "channel": BlobStorageChannel(blob_url_with_sas), "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), } diff --git a/tests/foundry/docker_server_hook.py b/tests/foundry/docker_server_hook.py index 9ce36ee..67c5536
100644 --- a/tests/foundry/docker_server_hook.py +++ b/tests/foundry/docker_server_hook.py @@ -38,10 +38,10 @@ def _matcher(request: requests.Request) -> requests.Response | None: mock.real_http = True mock.add_matcher(_matcher) -from aurora.foundry.common.channel import BlobStorageCommunication # noqa: E402 +from aurora.foundry.common.channel import BlobStorageChannel # noqa: E402 # Second, mock `azcopy`, assuming that the `azcopy` mock working directory is `/azcopy_work`. -BlobStorageCommunication._AZCOPY_EXECUTABLE = [ +BlobStorageChannel._AZCOPY_EXECUTABLE = [ "python", "/aurora_foundry/azcopy.py", "/azcopy_work", diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index e7035cb..c71fc42 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -47,7 +47,7 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: # At this point, we mock `azcopy` too. azcopy_path = Path(__file__).parents[0] / "azcopy.py" - sys.modules["aurora.foundry"].BlobStorageCommunication._AZCOPY_EXECUTABLE = [ + sys.modules["aurora.foundry"].BlobStorageChannel._AZCOPY_EXECUTABLE = [ "python", str(azcopy_path), str(azcopy_mock_work_path), From dc29d3877d979e1fad7883792405709ed717af61 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 12:13:00 +0100 Subject: [PATCH 30/44] Simplify CI pipeline --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d9f4fab..04949f6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,7 +28,7 @@ jobs: - name: Build Foundry image run: | - DOCKER_IMAGE=aurora-foundry make docker + make docker - name: Install dependencies run: | @@ -37,4 +37,4 @@ jobs: - name: Run tests run: | - DOCKER_IMAGE=aurora-foundry pytest -v --cov=aurora --cov-report term-missing + make test From 472cbbf5585247dc6700af05af2014740d1b524f Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 12:22:15 
+0100 Subject: [PATCH 31/44] Fix url parsing --- tests/foundry/conftest.py | 2 +- tests/foundry/docker_server_hook.py | 2 +- tests/foundry/runner.py | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index bcafa6b..8b73459 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -91,7 +91,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: url = urlparse(request.url) path = url.path[1:] # Remove leading slash. - if url.hostname and url.hostname.endswith("blob.core.windows.net"): + if url.hostname and url.hostname.endswith(".blob.core.windows.net"): local_path = azcopy_mock_work_dir / path response = requests.Response() diff --git a/tests/foundry/docker_server_hook.py b/tests/foundry/docker_server_hook.py index 67c5536..52867b6 100644 --- a/tests/foundry/docker_server_hook.py +++ b/tests/foundry/docker_server_hook.py @@ -20,7 +20,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: url = urlparse(request.url) path = url.path[1:] # Remove leading slash. - if url.hostname and url.hostname.endswith("blob.core.windows.net"): + if url.hostname and url.hostname.endswith(".blob.core.windows.net"): # Assume that the local folder `/azcopy_work` is used by the mock of `azcopy`. local_path = Path("/azcopy_work") / path diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index c71fc42..a88c964 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -8,6 +8,7 @@ import logging import sys from pathlib import Path +from urllib.parse import urlparse import click import requests @@ -55,11 +56,11 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: def _matcher(request: requests.Request) -> requests.Response | None: """Mock requests that check for the existence of blobs.""" - if "blob.core.windows.net/" in request.url: - # Split off the SAS token. 
- path, _ = request.url.split("?", 1) - # Split off the storage account URL. - _, path = path.split("blob.core.windows.net/", 1) + url = urlparse(request.url) + path = url.path[1:] # Remove leading slash. + + if url.hostname and url.hostname.endswith(".blob.core.windows.net"): + local_path = azcopy_mock_work_path / path local_path = azcopy_mock_work_path / path From 23ceb538d544b707f0cb1643c1766ff0ad8f8dd1 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 12:30:13 +0100 Subject: [PATCH 32/44] Add some types --- tests/foundry/conftest.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 8b73459..28bf935 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -7,7 +7,7 @@ import time from contextlib import contextmanager from pathlib import Path -from typing import Generator +from typing import IO, Generator, Tuple from urllib.parse import urlparse import pytest @@ -20,7 +20,10 @@ @contextmanager -def runner_process(azcopy_mock_work_dir: Path): +def runner_process( + azcopy_mock_work_dir: Path, +) -> Generator[Tuple[subprocess.Popen, IO, IO], None, None]: + """Launch a runner process that mocks the Azure ML Inference Server.""" score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" runner_path = Path(__file__).parents[0] / "runner.py" p = subprocess.Popen( @@ -37,7 +40,11 @@ def runner_process(azcopy_mock_work_dir: Path): @contextmanager -def mock_foundry_responses_subprocess(stdin, stdout, requests_mock, base_address=MOCK_ADDRESS): +def mock_foundry_responses_subprocess( + stdin: IO, stdout: IO, requests_mock, base_address: str = MOCK_ADDRESS +) -> Generator[None, None, None]: + """Mock requests to Foundry by redirecting them to the subprocess.""" + def _mock_send(request, context) -> dict: method = request.method.encode("unicode_escape") text = request.text or "" From ad8769586fe79ea5886a19646bf4d451f72d6ccc Mon 
Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 12:48:58 +0100 Subject: [PATCH 33/44] Add test that uses a real blob storage container --- tests/foundry/conftest.py | 72 +++++++++++++++++++++++------ tests/foundry/docker_server_hook.py | 17 +++---- tests/foundry/runner.py | 58 ++++++++++++----------- 3 files changed, 98 insertions(+), 49 deletions(-) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index 28bf935..e41741d 100644 --- a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -21,13 +21,22 @@ @contextmanager def runner_process( - azcopy_mock_work_dir: Path, + azcopy_mock_work_path: Path | None, ) -> Generator[Tuple[subprocess.Popen, IO, IO], None, None]: """Launch a runner process that mocks the Azure ML Inference Server.""" score_script_path = Path(__file__).parents[2] / "aurora/foundry/server/score.py" runner_path = Path(__file__).parents[0] / "runner.py" p = subprocess.Popen( - ["python", runner_path, azcopy_mock_work_dir, score_script_path], + [ + "python", + runner_path, + *( + ["--azcopy-mock-work-path", str(azcopy_mock_work_path)] + if azcopy_mock_work_path + else [] + ), + score_script_path, + ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, ) @@ -72,23 +81,18 @@ def _mock_send(request, context) -> dict: yield -@pytest.fixture(params=["subprocess", "docker"]) -def mock_foundry_client( - request, - monkeypatch, - requests_mock, - tmp_path: Path, -) -> Generator[dict, None, None]: +def mock_azcopy(tmp_path: Path, monkeypatch, requests_mock) -> Tuple[Path, Path, str]: + """Mock `azcopy`.""" # Communicate via blob storage, so mock `azcopy` too. - azcopy_mock_work_dir = tmp_path / "azcopy_work" + azcopy_mock_work_path = tmp_path / "azcopy_work" # It's important to already create the work folder. If we don't then the Docker image will # create it, and the permissions will then be wrong. 
- azcopy_mock_work_dir.mkdir(exist_ok=True, parents=True) + azcopy_mock_work_path.mkdir(exist_ok=True, parents=True) azcopy_path = Path(__file__).parents[0] / "azcopy.py" monkeypatch.setattr( BlobStorageChannel, "_AZCOPY_EXECUTABLE", - ["python", str(azcopy_path), str(azcopy_mock_work_dir)], + ["python", str(azcopy_path), str(azcopy_mock_work_path)], ) # The below test URL must start with `https`! blob_url_with_sas = "https://storageaccount.blob.core.windows.net/container/folder?SAS" @@ -99,7 +103,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: path = url.path[1:] # Remove leading slash. if url.hostname and url.hostname.endswith(".blob.core.windows.net"): - local_path = azcopy_mock_work_dir / path + local_path = azcopy_mock_work_path / path response = requests.Response() if local_path.exists(): @@ -112,8 +116,42 @@ def _matcher(request: requests.Request) -> requests.Response | None: requests_mock.add_matcher(_matcher) + return azcopy_path, azcopy_mock_work_path, blob_url_with_sas + + +@pytest.fixture( + params=[ + "subprocess", + "subprocess-real-container", + "docker", + ] +) +def mock_foundry_client( + request, + tmp_path: Path, + monkeypatch, + requests_mock, +) -> Generator[dict, None, None]: if request.param == "subprocess": - with runner_process(azcopy_mock_work_dir) as (p, stdin, stdout): # noqa: SIM117 + azcopy_path, azcopy_mock_work_path, blob_url_with_sas = mock_azcopy( + tmp_path, monkeypatch, requests_mock + ) + + with runner_process(azcopy_mock_work_path) as (p, stdin, stdout): # noqa: SIM117 + with mock_foundry_responses_subprocess(stdin, stdout, requests_mock): + yield { + "channel": BlobStorageChannel(blob_url_with_sas), + "foundry_client": FoundryClient(MOCK_ADDRESS, "mock-token"), + } + + elif request.param == "subprocess-real-container": + requests_mock.real_http = True + + if "TEST_BLOB_URL_WITH_SAS" not in os.environ: + pytest.skip("`TEST_BLOB_URL_WITH_SAS` is not set, so test cannot be run.") + blob_url_with_sas = 
os.environ["TEST_BLOB_URL_WITH_SAS"] + + with runner_process(None) as (p, stdin, stdout): # noqa: SIM117 with mock_foundry_responses_subprocess(stdin, stdout, requests_mock): yield { "channel": BlobStorageChannel(blob_url_with_sas), @@ -121,6 +159,10 @@ def _matcher(request: requests.Request) -> requests.Response | None: } elif request.param == "docker": + azcopy_path, azcopy_mock_work_path, blob_url_with_sas = mock_azcopy( + tmp_path, monkeypatch, requests_mock + ) + requests_mock.real_http = True if "DOCKER_IMAGE" not in os.environ: @@ -142,7 +184,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: "--rm", "-t", "-v", - f"{azcopy_mock_work_dir}:/azcopy_work", + f"{azcopy_mock_work_path}:/azcopy_work", "--mount", f"type=bind,src={azcopy_path},dst=/aurora_foundry/azcopy.py,readonly", "--mount", diff --git a/tests/foundry/docker_server_hook.py b/tests/foundry/docker_server_hook.py index 52867b6..ae311db 100644 --- a/tests/foundry/docker_server_hook.py +++ b/tests/foundry/docker_server_hook.py @@ -38,11 +38,12 @@ def _matcher(request: requests.Request) -> requests.Response | None: mock.real_http = True mock.add_matcher(_matcher) -from aurora.foundry.common.channel import BlobStorageChannel # noqa: E402 - -# Second, mock `azcopy`, assuming that the `azcopy` mock working directory is `/azcopy_work`. -BlobStorageChannel._AZCOPY_EXECUTABLE = [ - "python", - "/aurora_foundry/azcopy.py", - "/azcopy_work", -] +if Path("/azcopy_work").exists(): + from aurora.foundry.common.channel import BlobStorageChannel # noqa: E402 + + # Second, mock `azcopy`, assuming that the `azcopy` mock working directory is `/azcopy_work`. 
+ BlobStorageChannel._AZCOPY_EXECUTABLE = [ + "python", + "/aurora_foundry/azcopy.py", + "/azcopy_work", + ] diff --git a/tests/foundry/runner.py b/tests/foundry/runner.py index a88c964..50c5e9b 100644 --- a/tests/foundry/runner.py +++ b/tests/foundry/runner.py @@ -8,6 +8,7 @@ import logging import sys from pathlib import Path +from typing import Callable from urllib.parse import urlparse import click @@ -26,9 +27,8 @@ @click.command() -@click.argument( - "azcopy_mock_work_path", - required=True, +@click.option( + "--azcopy-mock-work-path", type=click.Path( exists=False, file_okay=False, dir_okay=True, resolve_path=True, path_type=Path ), @@ -38,7 +38,7 @@ required=True, type=click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True, path_type=Path), ) -def main(azcopy_mock_work_path: Path, path: Path) -> None: +def main(azcopy_mock_work_path: Path | None, path: Path) -> None: spec = util.spec_from_file_location("score", path) assert spec is not None, "Could not load specification." score = util.module_from_spec(spec) @@ -46,36 +46,42 @@ def main(azcopy_mock_work_path: Path, path: Path) -> None: assert spec.loader is not None, "Specification has no loader." spec.loader.exec_module(score) - # At this point, we mock `azcopy` too. - azcopy_path = Path(__file__).parents[0] / "azcopy.py" - sys.modules["aurora.foundry"].BlobStorageChannel._AZCOPY_EXECUTABLE = [ - "python", - str(azcopy_path), - str(azcopy_mock_work_path), - ] + if azcopy_mock_work_path: + # At this point, we mock `azcopy` too. 
+ azcopy_path = Path(__file__).parents[0] / "azcopy.py" + sys.modules["aurora.foundry"].BlobStorageChannel._AZCOPY_EXECUTABLE = [ + "python", + str(azcopy_path), + str(azcopy_mock_work_path), + ] + + _matcher: Callable[[requests.Request], requests.Response | None] | None - def _matcher(request: requests.Request) -> requests.Response | None: - """Mock requests that check for the existence of blobs.""" - url = urlparse(request.url) - path = url.path[1:] # Remove leading slash. + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + url = urlparse(request.url) + path = url.path[1:] # Remove leading slash. - if url.hostname and url.hostname.endswith(".blob.core.windows.net"): - local_path = azcopy_mock_work_path / path + if url.hostname and url.hostname.endswith(".blob.core.windows.net"): + local_path = azcopy_mock_work_path / path - local_path = azcopy_mock_work_path / path + local_path = azcopy_mock_work_path / path - response = requests.Response() - if local_path.exists(): - response.status_code = 200 - else: - response.status_code = 404 - return response + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response - return None + return None + else: + _matcher = None with requests_mock.Mocker() as mock: mock.real_http = True - mock.add_matcher(_matcher) + if _matcher is not None: + mock.add_matcher(_matcher) score.init() From 993f8fd207be77097a028428380d2b3cee0fd4fb Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 12:58:07 +0100 Subject: [PATCH 34/44] Add Docker test for an actual container --- tests/foundry/conftest.py | 63 +++++++++++++++++++++++++++++ tests/foundry/docker_server_hook.py | 51 ++++++++++++----------- 2 files changed, 88 insertions(+), 26 deletions(-) diff --git a/tests/foundry/conftest.py b/tests/foundry/conftest.py index e41741d..28887a9 100644 --- 
a/tests/foundry/conftest.py +++ b/tests/foundry/conftest.py @@ -124,6 +124,7 @@ def _matcher(request: requests.Request) -> requests.Response | None: "subprocess", "subprocess-real-container", "docker", + "docker-real-container", ] ) def mock_foundry_client( @@ -197,6 +198,68 @@ def mock_foundry_client( docker_image, ], ) + + try: + # Wait for the server to come online. + start = time.time() + while True: + try: + res = requests.get("http://127.0.0.1:5001/") + res.raise_for_status() + except (requests.ConnectionError, requests.HTTPError) as e: + # Try for at most 10 seconds. + if time.time() - start < 10: + time.sleep(0.5) + continue + else: + raise e + break + + yield { + "channel": BlobStorageChannel(blob_url_with_sas), + "foundry_client": FoundryClient("http://127.0.0.1:5001", "mock-token"), + } + + finally: + p.terminate() + p.wait() + + elif request.param == "docker-real-container": + requests_mock.real_http = True + + if "TEST_BLOB_URL_WITH_SAS" not in os.environ: + pytest.skip("`TEST_BLOB_URL_WITH_SAS` is not set, so test cannot be run.") + blob_url_with_sas = os.environ["TEST_BLOB_URL_WITH_SAS"] + + if "DOCKER_IMAGE" not in os.environ: + raise RuntimeError( + "Set the environment variable `DOCKER_IMAGE` " + "to the release image of Aurora Foundry." + ) + docker_image = os.environ["DOCKER_IMAGE"] + + # Run the Docker container. Assume that it has already been built. Insert the hook + # to mock things on the server side. + server_hook = Path(__file__).parents[0] / "docker_server_hook.py" + p = subprocess.Popen( + [ + "docker", + "run", + "-p", + "5001:5001", + "--rm", + "-t", + "--mount", + ( + f"type=bind" + f",src={server_hook}" + f",dst=/aurora_foundry/aurora/foundry/server/_hook.py" + f",readonly" + ), + docker_image, + ], + ) + try: # Wait for the server to come online. 
start = time.time() diff --git a/tests/foundry/docker_server_hook.py b/tests/foundry/docker_server_hook.py index ae311db..e921820 100644 --- a/tests/foundry/docker_server_hook.py +++ b/tests/foundry/docker_server_hook.py @@ -5,40 +5,39 @@ from pathlib import Path from urllib.parse import urlparse -# This will be run in the release Docker image, so packages required for mocking are not available. -subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "requests_mock"]) - - -import requests # noqa: E402 -import requests_mock # noqa: E402 - -# First, mock requests that check for the existence of blobs. +# If `/azcopy_work` does not exist, nothing needs to be mocked. +if Path("/azcopy_work").exists(): + # This will be run in the release Docker image, so packages required for mocking are not + # available. + subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "requests_mock"]) + import requests # noqa: E402 + import requests_mock # noqa: E402 -def _matcher(request: requests.Request) -> requests.Response | None: - """Mock requests that check for the existence of blobs.""" - url = urlparse(request.url) - path = url.path[1:] # Remove leading slash. + # First, mock requests that check for the existence of blobs. - if url.hostname and url.hostname.endswith(".blob.core.windows.net"): - # Assume that the local folder `/azcopy_work` is used by the mock of `azcopy`. - local_path = Path("/azcopy_work") / path + def _matcher(request: requests.Request) -> requests.Response | None: + """Mock requests that check for the existence of blobs.""" + url = urlparse(request.url) + path = url.path[1:] # Remove leading slash. - response = requests.Response() - if local_path.exists(): - response.status_code = 200 - else: - response.status_code = 404 - return response + if url.hostname and url.hostname.endswith(".blob.core.windows.net"): + # Assume that the local folder `/azcopy_work` is used by the mock of `azcopy`. 
+ local_path = Path("/azcopy_work") / path - return None + response = requests.Response() + if local_path.exists(): + response.status_code = 200 + else: + response.status_code = 404 + return response + return None -mock = requests_mock.Mocker().__enter__() -mock.real_http = True -mock.add_matcher(_matcher) + mock = requests_mock.Mocker().__enter__() + mock.real_http = True + mock.add_matcher(_matcher) -if Path("/azcopy_work").exists(): from aurora.foundry.common.channel import BlobStorageChannel # noqa: E402 # Second, mock `azcopy`, assuming that the `azcopy` mock working directory is `/azcopy_work`. From d33c147f418b859bb73b0dd7268aa71a7dbe889c Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 13:11:12 +0100 Subject: [PATCH 35/44] Remove deps from notebooks --- docs/example_era5.ipynb | 2 +- docs/example_hres_0.1.ipynb | 2 +- docs/example_hres_t0.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/example_era5.ipynb b/docs/example_era5.ipynb index 85c8f29..19de4ee 100644 --- a/docs/example_era5.ipynb +++ b/docs/example_era5.ipynb @@ -12,7 +12,7 @@ "Running this notebook requires additional Python packages. You can install these as follows:\n", "\n", "```\n", - "pip install cdsapi xarray netcdf4 matplotlib\n", + "pip install cdsapi matplotlib\n", "```\n", "\n", "## Downloading the Data\n", diff --git a/docs/example_hres_0.1.ipynb b/docs/example_hres_0.1.ipynb index 8ef076e..20e42f0 100644 --- a/docs/example_hres_0.1.ipynb +++ b/docs/example_hres_0.1.ipynb @@ -12,7 +12,7 @@ "Running this notebook requires additional Python packages. 
You can install these as follows:\n", "\n", "```\n", - "pip install requests xarray cfgrib scipy matplotlib\n", + "pip install requests cfgrib matplotlib\n", "```\n", "\n", "\n", diff --git a/docs/example_hres_t0.ipynb b/docs/example_hres_t0.ipynb index d765158..bd192c6 100644 --- a/docs/example_hres_t0.ipynb +++ b/docs/example_hres_t0.ipynb @@ -12,7 +12,7 @@ "Running this notebook requires additional Python packages. You can install these as follows:\n", "\n", "```\n", - "pip install gcsfs cdsapi xarray zarr netcdf4 matplotlib\n", + "pip install gcsfs cdsapi zarr matplotlib\n", "```\n", "\n", "## Downloading the Data\n", From 27321cdbcfbc1b0611877270907cf8ef18c7a44c Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 13:41:24 +0100 Subject: [PATCH 36/44] Add outline of demo --- aurora/foundry/demo/__init__.py | 1 + aurora/foundry/demo/hres_t0_data.py | 140 ++++++++++++++++++++++++++++ docs/_toc.yml | 1 + docs/foundry/demo_hres_t0.ipynb | 124 ++++++++++++++++++++++++ docs/foundry/submission.md | 2 +- 5 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 aurora/foundry/demo/__init__.py create mode 100644 aurora/foundry/demo/hres_t0_data.py create mode 100644 docs/foundry/demo_hres_t0.ipynb diff --git a/aurora/foundry/demo/__init__.py b/aurora/foundry/demo/__init__.py new file mode 100644 index 0000000..a679a52 --- /dev/null +++ b/aurora/foundry/demo/__init__.py @@ -0,0 +1 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" diff --git a/aurora/foundry/demo/hres_t0_data.py b/aurora/foundry/demo/hres_t0_data.py new file mode 100644 index 0000000..b70238a --- /dev/null +++ b/aurora/foundry/demo/hres_t0_data.py @@ -0,0 +1,140 @@ +"""Copyright (c) Microsoft Corporation. 
Licensed under the MIT license.""" + +import pickle +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +import fsspec +import numpy as np +import torch +import xarray as xr +from huggingface_hub import hf_hub_download + +from aurora import Batch, Metadata + + +def load_batch(day: datetime = datetime(2022, 5, 11), cache_path: str = "~/downloads") -> Batch: + """Download and load an HRES T0 batch for UTC 12 on `day`. + + Automatically installs any required dependencies. + + Caches the data at `cache_path`. + + Requires no authentication. + + Args: + day (datetime, optional): Day to download and load a batch for. Defaults to 11 May 2022. + cache_path (str, optional): Path to cache the downloads at. + + Return: + :class:`aurora.batch.Batch`: Batch. + """ + return _load_batch(day.strftime("%Y-%m-%d"), Path(cache_path)) + + +def _load_batch(day: str, cache_path: Path) -> Batch: + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "fsspec", "gcsfs", "zarr", "matplotlib"], + stdout=subprocess.DEVNULL, + ) + + cache_path = cache_path.expanduser() + cache_path.mkdir(parents=True, exist_ok=True) + + # We will download from Google Cloud. + url = "gs://weatherbench2/datasets/hres_t0/2016-2022-6h-1440x721.zarr" + ds: xr.Dataset | None = None + + # Download the surface-level variables. + if not (cache_path / f"{day}-surface-level.nc").exists(): + ds = ds or xr.open_zarr(fsspec.get_mapper(url), chunks=None) + surface_vars = [ + "10m_u_component_of_wind", + "10m_v_component_of_wind", + "2m_temperature", + "mean_sea_level_pressure", + ] + ds_surf = ds[surface_vars].sel(time=day).compute() + ds_surf.to_netcdf(str(cache_path / f"{day}-surface-level.nc")) + + # Download the static variables.
+ if not (cache_path / "static.nc").exists(): + path = hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-static.pickle") + with open(path, "rb") as f: + static_vars = pickle.load(f) + ds_static = xr.Dataset( + data_vars={k: (["latitude", "longitude"], v) for k, v in static_vars.items()}, + coords={ + "latitude": ("latitude", np.linspace(90, -90, 721)), + "longitude": ("longitude", np.linspace(0, 360, 1440, endpoint=False)), + }, + ) + ds_static.to_netcdf(str(cache_path / "static.nc")) + + # Download the atmospheric variables. + if not (cache_path / f"{day}-atmospheric.nc").exists(): + ds = ds or xr.open_zarr(fsspec.get_mapper(url), chunks=None) + atmos_vars = [ + "temperature", + "u_component_of_wind", + "v_component_of_wind", + "specific_humidity", + "geopotential", + ] + ds_atmos = ds[atmos_vars].sel(time=day).compute() + ds_atmos.to_netcdf(str(cache_path / f"{day}-atmospheric.nc")) + + static_vars_ds = xr.open_dataset(cache_path / "static.nc", engine="netcdf4") + surf_vars_ds = xr.open_dataset(cache_path / f"{day}-surface-level.nc", engine="netcdf4") + atmos_vars_ds = xr.open_dataset(cache_path / f"{day}-atmospheric.nc", engine="netcdf4") + + i = 2 # Select this time index in the downloaded data. + + def _prepare(x: np.ndarray) -> torch.Tensor: + """Prepare a variable. + + This does the following things: + * Select time indices `i` and `i - 1`. + * Insert an empty batch dimension with `[None]`. + * Flip along the latitude axis to ensure that the latitudes are decreasing. + * Copy the data, because the data must be contiguous when converting to PyTorch. + * Convert to PyTorch. 
+ """ + return torch.from_numpy(x[[i - 1, i]][None][..., ::-1, :].copy()) + + return Batch( + surf_vars={ + "2t": _prepare(surf_vars_ds["2m_temperature"].values), + "10u": _prepare(surf_vars_ds["10m_u_component_of_wind"].values), + "10v": _prepare(surf_vars_ds["10m_v_component_of_wind"].values), + "msl": _prepare(surf_vars_ds["mean_sea_level_pressure"].values), + }, + static_vars={ + # The static variables are constant, so we just get them for the first time. They + # don't need to be flipped along the latitude dimension, because they are from + # ERA5. + "z": torch.from_numpy(static_vars_ds["z"].values[0]), + "slt": torch.from_numpy(static_vars_ds["slt"].values[0]), + "lsm": torch.from_numpy(static_vars_ds["lsm"].values[0]), + }, + atmos_vars={ + "t": _prepare(atmos_vars_ds["temperature"].values), + "u": _prepare(atmos_vars_ds["u_component_of_wind"].values), + "v": _prepare(atmos_vars_ds["v_component_of_wind"].values), + "q": _prepare(atmos_vars_ds["specific_humidity"].values), + "z": _prepare(atmos_vars_ds["geopotential"].values), + }, + metadata=Metadata( + # Flip the latitudes! We need to copy before converting to PyTorch, because the + # data must be contiguous. + lat=torch.from_numpy(surf_vars_ds.latitude.values[::-1].copy()), + lon=torch.from_numpy(surf_vars_ds.longitude.values), + # Converting to `datetime64[s]` ensures that the output of `tolist()` gives + # `datetime.datetime`s. Note that this needs to be a tuple of length one: + # one value for every batch element. 
+ time=(surf_vars_ds.time.values.astype("datetime64[s]").tolist()[i],), + atmos_levels=tuple(int(level) for level in atmos_vars_ds.level.values), + ), + ) diff --git a/docs/_toc.yml b/docs/_toc.yml index b22099d..bf6dc58 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -20,4 +20,5 @@ parts: - file: foundry/intro - file: foundry/submission - file: foundry/server + - file: foundry/demo_hres_t0 - file: foundry/api diff --git a/docs/foundry/demo_hres_t0.ipynb b/docs/foundry/demo_hres_t0.ipynb new file mode 100644 index 0000000..a483f6f --- /dev/null +++ b/docs/foundry/demo_hres_t0.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2d2d6659-d16f-4736-91b0-16efdfa58147", + "metadata": {}, + "source": [ + "# Demo: HRES T0" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bba46076-2b4f-452f-b694-ee108eae839d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/torchvision/image.so: undefined symbol: _ZN3c1017RegisterOperatorsD1Ev'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", + " warn(\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "torchdata 0.6.1 requires torch==2.0.1, but you have torch 2.2.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "from aurora.foundry.demo.hres_t0_data import load_batch\n", + "\n", + "initial_condition = load_batch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07fd311b-54c0-4144-beb6-4a81d2056507", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "import torch\n", + "\n", + "from aurora import Batch, Metadata\n", + "from aurora.foundry import BlobStorageChannel, FoundryClient, submit\n", + "\n", + "foundry_client = FoundryClient(\n", + " endpoint=\"https://endpoint/\",\n", + " token=\"TOKEN\",\n", + ")\n", + "channel = BlobStorageChannel(\"https://my.blob.core.windows.net/container/folder?\")\n", + "\n", + "initial_condition = Batch(\n", + " surf_vars={k: torch.randn(1, 2, 17, 32) for k in (\"2t\", \"10u\", \"10v\", \"msl\")},\n", + " static_vars={k: torch.randn(17, 32) for k in (\"lsm\", \"z\", \"slt\")},\n", + " atmos_vars={k: torch.randn(1, 2, 4, 17, 32) for k in (\"z\", \"u\", \"v\", \"t\", \"q\")},\n", + " metadata=Metadata(\n", + " lat=torch.linspace(90, -90, 17),\n", + " lon=torch.linspace(0, 360, 32 + 1)[:-1],\n", + " time=(datetime(2020, 6, 1, 12, 0),),\n", + " atmos_levels=(100, 250, 500, 850),\n", + " ),\n", + ")\n", + "\n", + "predictions = list(\n", + " submit(\n", + " initial_condition,\n", + " model_name=\"aurora-0.25-finetuned\",\n", + " num_steps=4,\n", + " foundry_client=foundry_client,\n", + " channel=channel,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ea3d81c-6932-46f6-a91b-6fc79d549b29", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots(2, 2, figsize=(12, 6.5))\n", + "\n", + "for i in range(ax.shape[0]):\n", + " pred = 
predictions[i]\n", + "\n", + " ax[i, 0].imshow(pred.surf_vars[\"2t\"][0, 0].numpy() - 273.15, vmin=-50, vmax=50)\n", + " ax[i, 0].set_title(str(pred.metadata.time[0]))\n", + " ax[i, 0].set_xticks([])\n", + " ax[i, 0].set_yticks([])\n", + "\n", + "plt.tight_layout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/foundry/submission.md b/docs/foundry/submission.md index 1763850..18b7d74 100644 --- a/docs/foundry/submission.md +++ b/docs/foundry/submission.md @@ -57,7 +57,7 @@ for pred in submit( model_name="aurora-0.25-small-pretrained", num_steps=4, foundry_client=foundry_client, - channel=communication, + channel=channel, ): pass # Do something with `pred`. 
``` From 842dbc7364b60fc26e55e76d002d52ac00d2d885 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 13:43:44 +0100 Subject: [PATCH 37/44] Let the demo depend on env vars --- docs/foundry/demo_hres_t0.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/foundry/demo_hres_t0.ipynb b/docs/foundry/demo_hres_t0.ipynb index a483f6f..1752ec7 100644 --- a/docs/foundry/demo_hres_t0.ipynb +++ b/docs/foundry/demo_hres_t0.ipynb @@ -41,6 +41,7 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "from datetime import datetime\n", "\n", "import torch\n", @@ -49,10 +50,10 @@ "from aurora.foundry import BlobStorageChannel, FoundryClient, submit\n", "\n", "foundry_client = FoundryClient(\n", - " endpoint=\"https://endpoint/\",\n", - " token=\"TOKEN\",\n", + " endpoint=os.environ[\"FOUNDRY_ENDPOINT\"],\n", + " token=os.environ[\"FOUNDRY_TOKEN\"],\n", ")\n", - "channel = BlobStorageChannel(\"https://my.blob.core.windows.net/container/folder?\")\n", + "channel = BlobStorageChannel(os.environ[\"BLOB_URL_WITH_SAS\"])\n", "\n", "initial_condition = Batch(\n", " surf_vars={k: torch.randn(1, 2, 17, 32) for k in (\"2t\", \"10u\", \"10v\", \"msl\")},\n", From fd6ef690088734197f4403c08585021c4b4ae260 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 13:44:42 +0100 Subject: [PATCH 38/44] Fix plotting --- docs/foundry/demo_hres_t0.ipynb | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/foundry/demo_hres_t0.ipynb b/docs/foundry/demo_hres_t0.ipynb index 1752ec7..fc2cac0 100644 --- a/docs/foundry/demo_hres_t0.ipynb +++ b/docs/foundry/demo_hres_t0.ipynb @@ -87,15 +87,16 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", - "fig, ax = plt.subplots(2, 2, figsize=(12, 6.5))\n", + "fig, axs = plt.subplots(2, 2, figsize=(12, 6.5))\n", "\n", - "for i in range(ax.shape[0]):\n", + "for i in range(4):\n", " pred = predictions[i]\n", "\n", - " ax[i, 0].imshow(pred.surf_vars[\"2t\"][0, 
0].numpy() - 273.15, vmin=-50, vmax=50)\n", - " ax[i, 0].set_title(str(pred.metadata.time[0]))\n", - " ax[i, 0].set_xticks([])\n", - " ax[i, 0].set_yticks([])\n", + " ax = axs[i // 2, i % 2]\n", + " ax.imshow(pred.surf_vars[\"2t\"][0, 0].numpy() - 273.15, vmin=-50, vmax=50)\n", + " ax.set_title(str(pred.metadata.time[0]))\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])\n", "\n", "plt.tight_layout()" ] From 6de5ca991457a2794dc32191641f5537fef594c3 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 13:45:46 +0100 Subject: [PATCH 39/44] Remove random data --- docs/foundry/demo_hres_t0.ipynb | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/docs/foundry/demo_hres_t0.ipynb b/docs/foundry/demo_hres_t0.ipynb index fc2cac0..2edc40a 100644 --- a/docs/foundry/demo_hres_t0.ipynb +++ b/docs/foundry/demo_hres_t0.ipynb @@ -42,11 +42,7 @@ "outputs": [], "source": [ "import os\n", - "from datetime import datetime\n", "\n", - "import torch\n", - "\n", - "from aurora import Batch, Metadata\n", "from aurora.foundry import BlobStorageChannel, FoundryClient, submit\n", "\n", "foundry_client = FoundryClient(\n", @@ -55,18 +51,6 @@ ")\n", "channel = BlobStorageChannel(os.environ[\"BLOB_URL_WITH_SAS\"])\n", "\n", - "initial_condition = Batch(\n", - " surf_vars={k: torch.randn(1, 2, 17, 32) for k in (\"2t\", \"10u\", \"10v\", \"msl\")},\n", - " static_vars={k: torch.randn(17, 32) for k in (\"lsm\", \"z\", \"slt\")},\n", - " atmos_vars={k: torch.randn(1, 2, 4, 17, 32) for k in (\"z\", \"u\", \"v\", \"t\", \"q\")},\n", - " metadata=Metadata(\n", - " lat=torch.linspace(90, -90, 17),\n", - " lon=torch.linspace(0, 360, 32 + 1)[:-1],\n", - " time=(datetime(2020, 6, 1, 12, 0),),\n", - " atmos_levels=(100, 250, 500, 850),\n", - " ),\n", - ")\n", - "\n", "predictions = list(\n", " submit(\n", " initial_condition,\n", From 22e8fd361418ac7249f9b4bb510f5953aba4d86c Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 14:13:40 +0100 Subject: 
[PATCH 40/44] Increment version --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cffd08d..f5cec3a 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: install test docs docker-requirements docker swagger-file DOCKER_WS ?= testwsacr -DOCKER_IMAGE ?= aurora-foundry:20250110-1 +DOCKER_IMAGE ?= aurora-foundry:20250110-2 install: pip install --upgrade pip From cdb0e99e9bb8f132e2c9e38157e8cbaa04a4eb09 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 15:25:32 +0100 Subject: [PATCH 41/44] Run demo --- aurora/foundry/client/api.py | 4 +- aurora/foundry/demo/hres_t0_data.py | 6 +-- docs/foundry/demo_hres_t0.ipynb | 79 ++++++++++++++++++++++++----- 3 files changed, 71 insertions(+), 18 deletions(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index 0e33d24..cc28924 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -75,9 +75,10 @@ def submit( logger.info(f"Created task `{task_id}` at endpoint.") # Send the initial condition over. + logger.info("Uploading initial condition.") channel.send(batch, task_id, "input.nc") - previous_status: str = "No Status" + previous_status: str = "No status" previous_progress: int = 0 while True: @@ -112,3 +113,4 @@ def submit( logger.info("Retrieving predictions.") for prediction_name in iterate_prediction_files("prediction.nc", num_steps): yield channel.receive(task_id, prediction_name) + logger.info("All predictions have been retrieved.") diff --git a/aurora/foundry/demo/hres_t0_data.py b/aurora/foundry/demo/hres_t0_data.py index b70238a..f7600df 100644 --- a/aurora/foundry/demo/hres_t0_data.py +++ b/aurora/foundry/demo/hres_t0_data.py @@ -115,9 +115,9 @@ def _prepare(x: np.ndarray) -> torch.Tensor: # The static variables are constant, so we just get them for the first time. They # don't need to be flipped along the latitude dimension, because they are from # ERA5. 
- "z": torch.from_numpy(static_vars_ds["z"].values[0]), - "slt": torch.from_numpy(static_vars_ds["slt"].values[0]), - "lsm": torch.from_numpy(static_vars_ds["lsm"].values[0]), + "z": torch.from_numpy(static_vars_ds["z"].values), + "slt": torch.from_numpy(static_vars_ds["slt"].values), + "lsm": torch.from_numpy(static_vars_ds["lsm"].values), }, atmos_vars={ "t": _prepare(atmos_vars_ds["temperature"].values), diff --git a/docs/foundry/demo_hres_t0.ipynb b/docs/foundry/demo_hres_t0.ipynb index 2edc40a..b442a7b 100644 --- a/docs/foundry/demo_hres_t0.ipynb +++ b/docs/foundry/demo_hres_t0.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "bba46076-2b4f-452f-b694-ee108eae839d", "metadata": {}, "outputs": [ @@ -21,10 +21,7 @@ "/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/torchvision/image.so: undefined symbol: _ZN3c1017RegisterOperatorsD1Ev'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", - " warn(\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "torchdata 0.6.1 requires torch==2.0.1, but you have torch 2.2.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" + " warn(\n" ] } ], @@ -36,15 +33,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "07fd311b-54c0-4144-beb6-4a81d2056507", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-10 15:21:47,506 [INFO] Created task `d9100e57-15a4-49fe-8806-c5d67bc3870d` at endpoint.\n", + "2025-01-10 15:21:47,508 [INFO] Uploading initial condition.\n", + "2025-01-10 15:23:16,805 [INFO] Task status update: Queued\n", + "2025-01-10 15:23:23,864 [INFO] Task status update: Running\n", + "2025-01-10 15:23:52,220 [INFO] Task progress update: 25%.\n", + "2025-01-10 15:24:06,459 [INFO] Task progress update: 50%.\n", + "2025-01-10 15:24:13,532 [INFO] Task progress update: 75%.\n", + "2025-01-10 15:24:24,664 [INFO] Task status update: Successfully completed\n", + "2025-01-10 15:24:24,664 [INFO] Task progress update: 100%.\n", + "2025-01-10 15:24:24,665 [INFO] Task has been successfully completed!\n", + "2025-01-10 15:24:24,666 [INFO] Retrieving predictions.\n", + "2025-01-10 15:25:11,602 [INFO] All predictions have been retrieved.\n" + ] + } + ], "source": [ + "import logging\n", "import os\n", "\n", "from aurora.foundry import BlobStorageChannel, FoundryClient, submit\n", "\n", + "# Show what's happening!\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format=\"%(asctime)s [%(levelname)s] %(message)s\",\n", + " handlers=[logging.StreamHandler()],\n", + " force=True,\n", + ")\n", + "\n", "foundry_client = FoundryClient(\n", " endpoint=os.environ[\"FOUNDRY_ENDPOINT\"],\n", " token=os.environ[\"FOUNDRY_TOKEN\"],\n", @@ -64,25 +89,51 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "5ea3d81c-6932-46f6-a91b-6fc79d549b29", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ + "from datetime import timedelta\n", + "\n", "import matplotlib.pyplot as plt\n", "\n", - "fig, axs = plt.subplots(2, 2, figsize=(12, 6.5))\n", + "fig, axs = plt.subplots(3, 2, figsize=(10, 8))\n", + "\n", + "ax = axs[0, 0]\n", + "ax.imshow(initial_condition.surf_vars[\"2t\"][0, 0].numpy() - 273.15, vmin=-50, vmax=50)\n", + "ax.set_title(f\"2T at {initial_condition.metadata.time[0] - timedelta(hours=6)} (observed)\")\n", + "ax.set_xticks([])\n", + "ax.set_yticks([])\n", + "\n", + "ax = axs[0, 1]\n", + "ax.imshow(initial_condition.surf_vars[\"2t\"][0, 1].numpy() - 273.15, vmin=-50, vmax=50)\n", + "ax.set_title(f\"2T at {initial_condition.metadata.time[0]} (observed)\")\n", + "ax.set_xticks([])\n", + "ax.set_yticks([])\n", "\n", - "for i in range(4):\n", - " pred = predictions[i]\n", + "for i in range(2, 6):\n", + " pred = predictions[i - 2]\n", "\n", " ax = axs[i // 2, i % 2]\n", " ax.imshow(pred.surf_vars[\"2t\"][0, 0].numpy() - 273.15, vmin=-50, vmax=50)\n", - " ax.set_title(str(pred.metadata.time[0]))\n", + " ax.set_title(f\"2T at {pred.metadata.time[0]} (prediction)\")\n", " ax.set_xticks([])\n", " ax.set_yticks([])\n", "\n", - "plt.tight_layout()" + "plt.tight_layout()\n", + "plt.subplots_adjust(hspace=0.2)" ] } ], From bbeaa2f35f7d9ccf6e6ab9eb20a298b77efbad94 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 10 Jan 2025 15:57:06 +0100 Subject: [PATCH 42/44] Silence warnings --- aurora/foundry/demo/hres_t0_data.py | 2 + docs/foundry/demo_hres_t0.ipynb | 57 +++++++++++++++-------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/aurora/foundry/demo/hres_t0_data.py b/aurora/foundry/demo/hres_t0_data.py index f7600df..0eaecfb 100644 --- a/aurora/foundry/demo/hres_t0_data.py +++ b/aurora/foundry/demo/hres_t0_data.py @@ -35,9 +35,11 @@ def load_batch(day: datetime = datetime(2022, 5, 11), cache_path: str = "~/downl def _load_batch(day: str, cache_path: 
Path) -> Batch: + # Install any required packages and hide the output. This can be done in a nicer way. subprocess.check_call( [sys.executable, "-m", "pip", "install", "fsspec", "gcsfs", "zarr", "matplotlib"], stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, ) cache_path = cache_path.expanduser() diff --git a/docs/foundry/demo_hres_t0.ipynb b/docs/foundry/demo_hres_t0.ipynb index b442a7b..9ee592a 100644 --- a/docs/foundry/demo_hres_t0.ipynb +++ b/docs/foundry/demo_hres_t0.ipynb @@ -11,29 +11,32 @@ { "cell_type": "code", "execution_count": 1, + "id": "b7320530-4921-4b1f-b0fc-3d6aa6949631", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\") # Silence warnings in this demo." + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "bba46076-2b4f-452f-b694-ee108eae839d", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/home/wessel/miniconda3/envs/climai_global/lib/python3.10/site-packages/torchvision/image.so: undefined symbol: _ZN3c1017RegisterOperatorsD1Ev'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. 
Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", - " warn(\n" - ] - } - ], + "outputs": [], "source": [ + "from datetime import datetime\n", + "\n", "from aurora.foundry.demo.hres_t0_data import load_batch\n", "\n", - "initial_condition = load_batch()" + "initial_condition = load_batch(day=datetime(2022, 5, 11))" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "07fd311b-54c0-4144-beb6-4a81d2056507", "metadata": {}, "outputs": [ @@ -41,18 +44,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "2025-01-10 15:21:47,506 [INFO] Created task `d9100e57-15a4-49fe-8806-c5d67bc3870d` at endpoint.\n", - "2025-01-10 15:21:47,508 [INFO] Uploading initial condition.\n", - "2025-01-10 15:23:16,805 [INFO] Task status update: Queued\n", - "2025-01-10 15:23:23,864 [INFO] Task status update: Running\n", - "2025-01-10 15:23:52,220 [INFO] Task progress update: 25%.\n", - "2025-01-10 15:24:06,459 [INFO] Task progress update: 50%.\n", - "2025-01-10 15:24:13,532 [INFO] Task progress update: 75%.\n", - "2025-01-10 15:24:24,664 [INFO] Task status update: Successfully completed\n", - "2025-01-10 15:24:24,664 [INFO] Task progress update: 100%.\n", - "2025-01-10 15:24:24,665 [INFO] Task has been successfully completed!\n", - "2025-01-10 15:24:24,666 [INFO] Retrieving predictions.\n", - "2025-01-10 15:25:11,602 [INFO] All predictions have been retrieved.\n" + "2025-01-10 15:52:44,690 [INFO] Created task `405b219d-6cc5-4eca-b397-ad5f600e88da` at endpoint.\n", + "2025-01-10 15:52:44,692 [INFO] Uploading initial condition.\n", + "2025-01-10 15:54:09,879 [INFO] Task status update: Queued\n", + "2025-01-10 15:54:16,912 [INFO] Task status update: Running\n", + "2025-01-10 15:54:45,086 [INFO] Task progress update: 25%.\n", + "2025-01-10 15:54:59,139 [INFO] Task progress update: 50%.\n", + "2025-01-10 15:55:13,204 [INFO] Task progress update: 75%.\n", + "2025-01-10 15:55:20,279 [INFO] Task progress update: 100%.\n", 
+ "2025-01-10 15:55:24,403 [INFO] Task status update: Successfully completed\n", + "2025-01-10 15:55:24,404 [INFO] Task has been successfully completed!\n", + "2025-01-10 15:55:24,404 [INFO] Retrieving predictions.\n", + "2025-01-10 15:56:21,104 [INFO] All predictions have been retrieved.\n" ] } ], @@ -89,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "5ea3d81c-6932-46f6-a91b-6fc79d549b29", "metadata": {}, "outputs": [ From 690e0a4564bbf0d2f7df7d853fd96e74fac53264 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Sat, 11 Jan 2025 15:56:56 +0100 Subject: [PATCH 43/44] Read the acknowledgement only once --- aurora/foundry/client/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aurora/foundry/client/api.py b/aurora/foundry/client/api.py index cc28924..deb336b 100644 --- a/aurora/foundry/client/api.py +++ b/aurora/foundry/client/api.py @@ -80,6 +80,7 @@ def submit( previous_status: str = "No status" previous_progress: int = 0 + ack_read: bool = False while True: # Check on the progress of the task. The first progress check will trigger the task to be @@ -87,11 +88,12 @@ def submit( response = foundry_client.get_progress(task_id) task_info = TaskInfo(**response) - if task_info.submitted: + if task_info.submitted and not ack_read: # If the task has been submitted, we must be able to read the acknowledgement of the # initial condition. try: channel.read(task_id, "input.nc.ack", timeout=120) + ack_read = True # Read the acknowledgement only once. 
except TimeoutError as e: raise SubmissionError("Could not read acknowledgement of initial condition.") from e From 893fb196cb1e2aad9aed43927de5ec4960c7a283 Mon Sep 17 00:00:00 2001 From: Hannes Schulz Date: Sat, 11 Jan 2025 21:20:22 +0100 Subject: [PATCH 44/44] enable aurora logger --- aurora/foundry/server/score.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aurora/foundry/server/score.py b/aurora/foundry/server/score.py index af47d91..ecd9ff5 100644 --- a/aurora/foundry/server/score.py +++ b/aurora/foundry/server/score.py @@ -119,6 +119,8 @@ def __call__(self) -> None: def init() -> None: """Initialise. Do not load the model here, because which model we need depends on the submission.""" + logging.getLogger("aurora").setLevel(logging.INFO) + logger.info("Starting ThreadPoolExecutor") POOL.__enter__()