diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml index 5c712a8..c071934 100644 --- a/.github/workflows/test-examples.yml +++ b/.github/workflows/test-examples.yml @@ -35,6 +35,7 @@ jobs: - integrations/model-training/pytorch/notebooks/Histogram_Logging_Pytorch.ipynb - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb + - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb - integrations/model-training/tensorflow/notebooks/Comet_and_Tensorflow.ipynb - integrations/model-training/transformers/notebooks/Comet_with_Hugging_Face_Trainer.ipynb diff --git a/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb new file mode 100644 index 0000000..fcdfde9 --- /dev/null +++ b/integrations/model-training/ray-train/notebooks/Comet_with_ray_train_pytorch_lightning.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=ray_train&utm_medium=colab) is an MLOps Platform that is designed to help Data Scientists and Teams build better models faster! Comet provides tooling to track, Explain, Manage, and Monitor your models in a single place! 
It works with Jupyter Notebooks and Scripts and most importantly it's 100% free to get started!\n", + "\n", + "[Ray Train](https://docs.ray.io/en/latest/train/train.html) abstracts away the complexity of setting up a distributed training system.\n", + "\n", + "Instrument your runs with Comet to start managing experiments, create dataset versions and track hyperparameters for faster and easier reproducibility and collaboration.\n", + "\n", + "[Find more information about our integration with Ray Train](https://www.comet.ml/docs/v2/integrations/ml-frameworks/ray/)\n", + "\n", + "Get a preview for what's to come. Check out a completed experiment created from this notebook [here](https://www.comet.com/examples/comet-example-ray-train-keras/99d169308c854be7ac222c995a2bfa26?experiment-tab=systemMetrics).\n", + "\n", + "This example is based on the [following Ray Train Lightning example](https://docs.ray.io/en/latest/train/getting-started-pytorch-lightning.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZYchV5RWwdv5" + }, + "source": [ + "# Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DJnmqphuY2eI" + }, + "outputs": [], + "source": [ + "%pip install \"comet_ml>=3.47.1\" \"ray[air]>=2.1.0\" \"lightning\" \"torchvision\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "crOcPHobwhGL" + }, + "source": [ + "# Initialize Comet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HNQRM0U3caiY" + }, + "outputs": [], + "source": [ + "import comet_ml\n", + "import comet_ml.integration.ray\n", + "\n", + "comet_ml.login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cgqwGSwtzVWD" + }, + "source": [ + "# Import Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e-5rRYaUw5AF" + }, + "outputs": [], + "source": [ + "import os\n", + "import tempfile\n", + "\n", + "import 
torch\n", + "from torch.utils.data import DataLoader\n", + "from torchvision.models import resnet18\n", + "from torchvision.datasets import FashionMNIST\n", + "from torchvision.transforms import ToTensor, Normalize, Compose\n", + "import lightning.pytorch as pl\n", + "\n", + "import ray.train.lightning\n", + "from ray.train.torch import TorchTrainer\n", + "from ray.train import ScalingConfig, RunConfig" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare your model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Model, Loss, Optimizer\n", + "class ImageClassifier(pl.LightningModule):\n", + " def __init__(self):\n", + " super(ImageClassifier, self).__init__()\n", + " self.model = resnet18(num_classes=10)\n", + " self.model.conv1 = torch.nn.Conv2d(\n", + " 1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False\n", + " )\n", + " self.criterion = torch.nn.CrossEntropyLoss()\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " outputs = self.forward(x)\n", + " loss = self.criterion(outputs, y)\n", + " self.log(\"ligthning_loss\", loss, on_step=True, prog_bar=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return torch.optim.Adam(self.model.parameters(), lr=0.001)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TJuThf1TxP_G" + }, + "source": [ + "# Define your distributed training function\n", + "\n", + "This function is shipped to and executed on each distributed worker."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train_func(config):\n", + " from comet_ml.integration.ray import comet_worker_logger\n", + " from lightning.pytorch.loggers import CometLogger\n", + "\n", + " with comet_worker_logger(config) as experiment:\n", + " # Data\n", + " transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])\n", + " data_dir = os.path.join(tempfile.gettempdir(), \"data\")\n", + " train_data = FashionMNIST(\n", + " root=data_dir, train=True, download=True, transform=transform\n", + " )\n", + " train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)\n", + "\n", + " # Training\n", + " model = ImageClassifier()\n", + "\n", + " comet_logger = CometLogger()\n", + "\n", + " # Temporary workaround, can be removed once\n", + " # https://github.com/Lightning-AI/pytorch-lightning/pull/20275 has\n", + " # been merged and released\n", + " comet_logger._experiment = experiment\n", + "\n", + " # [1] Configure PyTorch Lightning Trainer.\n", + " trainer = pl.Trainer(\n", + " max_epochs=config[\"epochs\"],\n", + " devices=\"auto\",\n", + " accelerator=\"auto\",\n", + " strategy=ray.train.lightning.RayDDPStrategy(),\n", + " plugins=[ray.train.lightning.RayLightningEnvironment()],\n", + " callbacks=[ray.train.lightning.RayTrainReportCallback()],\n", + " logger=comet_logger,\n", + " # [1a] Optionally, disable the default checkpointing behavior\n", + " # in favor of the `RayTrainReportCallback` above.\n", + " enable_checkpointing=False,\n", + " log_every_n_steps=2,\n", + " )\n", + " trainer = ray.train.lightning.prepare_trainer(trainer)\n", + " trainer.fit(model, train_dataloaders=train_dataloader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Define the function that schedules the distributed job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(num_workers: int = 2, 
use_gpu: bool = False, epochs=1):\n", + " scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)\n", + " config = {\"use_gpu\": use_gpu, \"epochs\": epochs}\n", + "\n", + " callback = comet_ml.integration.ray.CometTrainLoggerCallback(\n", + " config, project_name=\"comet-example-ray-train-pytorch-lightning\"\n", + " )\n", + "\n", + " ray_trainer = TorchTrainer(\n", + " train_func,\n", + " scaling_config=scaling_config,\n", + " train_loop_config=config,\n", + " run_config=RunConfig(callbacks=[callback]),\n", + " )\n", + " result = ray_trainer.fit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train the model\n", + "\n", + "Ray will wait indefinitely if we request more `num_workers` than the available resources allow. The code below ensures we never request more CPUs than are available locally." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ideal_num_workers = 2\n", + "\n", + "available_local_cpu_count = os.cpu_count() - 1\n", + "num_workers = min(ideal_num_workers, available_local_cpu_count)\n", + "\n", + "if num_workers < 1:\n", + " num_workers = 1\n", + "\n", + "train(num_workers, use_gpu=False, epochs=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comet_ml.end()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}