diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c10e9d0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +data/* +lightning_logs/* +runs/* +MNIST/* +cifar-10-* diff --git a/README.md b/README.md index 7d67f86..1c754d2 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,95 @@ Simple MNIST classifier written in PyTorch, PyTorch Lightning, and Keras. -## Install Dependencies +# Install Dependencies ```bash +conda create --yes --name mnist python=3.8 +conda activate mnist pip install -r requirements.txt +pip install lightning-grid +grid login ``` -## PyTorch / Lightning +# PyTorch + +Use the CLI commands below or click +[![PyTorch](https://img.shields.io/badge/rid_AI-run-78FF96.svg?labelColor=black&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PHBhdGggZD0iTTEgMTR2MjBhMTQgMTQgMCAwMDE0IDE0aDlWMzYuOEgxMi42VjExaDIyLjV2N2gxMS4yVjE0QTE0IDE0IDAgMDAzMi40IDBIMTVBMTQgMTQgMCAwMDEgMTR6IiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTM1LjIgNDhoMTEuMlYyNS41SDIzLjl2MTEuM2gxMS4zVjQ4eiIgZmlsbD0iI2ZmZiIvPjwvc3ZnPg==)]( +https://platform.grid.ai/#/runs?script=https://github.com/robert-s-lee/hello_mnists/blob/6cbeebc74035cc802ddcacb852e8c284e243e4cf/pytorch.py&cloud=grid&instance=t2.medium&accelerators=1&disk_size=200&framework=lightning +) ```bash -# pytorch +# Local Run python pytorch.py -# lightning +# Grid.ai Run +grid run pytorch.py | tee /tmp/grid.run.log +``` + +# PyTorch Lightning + +Use the CLI commands below or click +[![Lightning](https://img.shields.io/badge/rid_AI-run-78FF96.svg?labelColor=black&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PHBhdGggZD0iTTEgMTR2MjBhMTQgMTQgMCAwMDE0IDE0aDlWMzYuOEgxMi42VjExaDIyLjV2N2gxMS4yVjE0QTE0IDE0IDAgMDAzMi40IDBIMTVBMTQgMTQgMCAwMDEgMTR6IiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTM1LjIgNDhoMTEuMlYyNS41SDIzLjl2MTEuM2gxMS4zVjQ4eiIgZmlsbD0iI2ZmZiIvPjwvc3ZnPg==)]( 
+https://platform.grid.ai/#/runs?script=https://github.com/robert-s-lee/hello_mnists/blob/6cbeebc74035cc802ddcacb852e8c284e243e4cf/pl_mnist.py&cloud=grid&instance=t2.medium&accelerators=1&disk_size=200&framework=lightning +) + +```bash +# Local Run python pl_mnist.py + +# Grid.ai Run +grid run pl_mnist.py | tee /tmp/grid.run.log ``` -## Keras +# Keras + +Use the CLI commands below or click +[![Keras](https://img.shields.io/badge/rid_AI-run-78FF96.svg?labelColor=black&logo=data:image/svg%2bxml;base64,PHN2ZyB3aWR0aD0iNDgiIGhlaWdodD0iNDgiIGZpbGw9Im5vbmUiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PHBhdGggZD0iTTEgMTR2MjBhMTQgMTQgMCAwMDE0IDE0aDlWMzYuOEgxMi42VjExaDIyLjV2N2gxMS4yVjE0QTE0IDE0IDAgMDAzMi40IDBIMTVBMTQgMTQgMCAwMDEgMTR6IiBmaWxsPSIjZmZmIi8+PHBhdGggZD0iTTM1LjIgNDhoMTEuMlYyNS41SDIzLjl2MTEuM2gxMS4zVjQ4eiIgZmlsbD0iI2ZmZiIvPjwvc3ZnPg==)]( +https://platform.grid.ai/#/runs?script=https://github.com/robert-s-lee/hello_mnists/blob/6cbeebc74035cc802ddcacb852e8c284e243e4cf/keras.py&cloud=grid&instance=t2.medium&accelerators=1&disk_size=200&framework=lightning&script_args=keras.py +) ```bash +# Local Run python keras.py + +# Grid.ai Run +grid run keras.py | tee /tmp/grid.run.log +``` + +# CIFAR-10 Bonus + + +```bash +# Local Run +python pl_cifar10.py + +# Grid.ai Run +grid run pl_cifar10.py | tee /tmp/grid.run.log +``` + +# Default Command Line Argument Values per Script + +| Argument Name  | keras.py | pl_cifar10.py | pl_mnist.py | pytorch.py| +| --:| :--: | :--: | :--: | :--: | +| --max_epochs | 5 | 10 | 10 | 14| +| --lr | 1.00E-03 | 1.00E-03 | 1.00E-03 | 1 | +| --batch_size | 32 | 32 | 32 | 64 | +| --data_dir | ./data/ | os.getcwd | os.getcwd |  | +| --num_workers |   | 8 | 8 |  | +| --gpus |   | 0 | 0 |  | +| --test_batch_size |   |   |   | 1000| +| --seed |   |   |   | 1| +| --save_model |   |   |   | FALSE| +| --log_interval |   |   |   | 10| +| --gamma |   |   |   | 0.7| +| --dry_run |   |   |   | FALSE| +| --cuda |   |   |   | FALSE | + +# Grid.ai Run Monitor + 
+```bash +RUN_NAME=$(grep grid_name /tmp/grid.run.log | cut -d':' -f 2 | sed -e 's/^[[:space:]]*//') +watch grid status --details $RUN_NAME +grid history | grep $RUN_NAME +grid logs ${RUN_NAME}-exp0 | tee /tmp/grid.exp0.log ``` diff --git a/keras.py b/keras.py index b156a94..f6a654f 100644 --- a/keras.py +++ b/keras.py @@ -1,14 +1,15 @@ -from argparse import ArgumentParser +from configargparse import ArgumentParser from pathlib import Path from tensorflow import keras # Define this script's flags parser = ArgumentParser() -parser.add_argument('--lr', type=float, default=1e-3) -parser.add_argument('--batch_size', type=int, default=32) -parser.add_argument('--max_epochs', type=int, default=5) +parser.add_argument('--lr', type=float, default=1e-3, env_var="MNIST_LR") +parser.add_argument('--batch_size', type=int, default=32, env_var="MNIST_BATCH_SIZE") +parser.add_argument('--max_epochs', type=int, default=5, env_var="MNIST_MAX_EPOCHS") parser.add_argument('--data_dir', type=str, default="./data/") +parser.add_argument('--num_workers', type=int, default=8) args = parser.parse_args() # Make sure data_dir is absolute + create it if it doesn't exist diff --git a/pl_cifar10.py b/pl_cifar10.py index 7236114..aecec4b 100644 --- a/pl_cifar10.py +++ b/pl_cifar10.py @@ -37,18 +37,20 @@ def training_step(self, batch, batch_idx): return loss if __name__ == '__main__': - from argparse import ArgumentParser + from configargparse import ArgumentParser parser = ArgumentParser() - parser.add_argument('--gpus', type=int, default=None) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--batch_size', type=int, default=32) - parser.add_argument('--max_epochs', type=int, default=10) + parser.add_argument('--gpus', type=int, default=0) + parser.add_argument('--lr', type=float, default=1e-3, env_var="CIFAR_LR") + parser.add_argument('--batch_size', type=int, default=32, env_var="CIFAR_BATCH_SIZE") + parser.add_argument('--max_epochs', type=int, default=10, 
env_var="CIFAR_MAX_EPOCHS") parser.add_argument('--data_dir', type=str, default=os.getcwd()) + parser.add_argument('--num_workers', type=int, default=8) + args = parser.parse_args() dataset = CIFAR10(args.data_dir, download=True, transform=transforms.ToTensor()) - train_loader = DataLoader(dataset, batch_size=args.batch_size) + train_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers) # init model model = LitModel(lr=args.lr) diff --git a/pl_mnist.py b/pl_mnist.py index cbb54c0..9f8eba1 100644 --- a/pl_mnist.py +++ b/pl_mnist.py @@ -36,18 +36,19 @@ def training_step(self, batch, batch_idx): return loss if __name__ == '__main__': - from argparse import ArgumentParser + from configargparse import ArgumentParser parser = ArgumentParser() - parser.add_argument('--gpus', type=int, default=None) - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--batch_size', type=int, default=32) - parser.add_argument('--max_epochs', type=int, default=10) + parser.add_argument('--gpus', type=int, default=0) + parser.add_argument('--lr', type=float, default=1e-3, env_var="MNIST_LR") + parser.add_argument('--batch_size', type=int, default=32, env_var="MNIST_BATCH_SIZE") + parser.add_argument('--max_epochs', type=int, default=10, env_var="MNIST_MAX_EPOCHS") parser.add_argument('--data_dir', type=str, default=os.getcwd()) + parser.add_argument('--num_workers', type=int, default=8) args = parser.parse_args() dataset = MNIST(args.data_dir, download=True, transform=transforms.ToTensor()) - train_loader = DataLoader(dataset, batch_size=args.batch_size) + train_loader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers) # init model model = LitModel(lr=args.lr) diff --git a/pytorch.py b/pytorch.py index 862e56a..5aad4b2 100644 --- a/pytorch.py +++ b/pytorch.py @@ -4,7 +4,7 @@ """ from __future__ import print_function -import argparse +from configargparse import ArgumentParser import torch import torch.nn 
as nn import torch.nn.functional as F @@ -87,26 +87,26 @@ def test(model, device, test_loader, epoch): def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + parser = ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch_size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)', env_var="MNIST_BATCH_SIZE") + parser.add_argument('--test_batch_size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') + parser.add_argument('--max_epochs', type=int, default=14, metavar='N', + help='number of epochs to train (default: 14)', env_var="MNIST_MAX_EPOCHS") parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') + help='learning rate (default: 1.0)', env_var="MNIST_LR") parser.add_argument('--gamma', type=float, default=0.7, metavar='M', help='Learning rate step gamma (default: 0.7)') parser.add_argument('--cuda', action='store_true', default=False, help='disables CUDA training') - parser.add_argument('--dry-run', action='store_true', default=False, + parser.add_argument('--dry_run', action='store_true', default=False, help='quickly check a single pass') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', + parser.add_argument('--log_interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, + 
parser.add_argument('--save_model', action='store_true', default=False, help='For Saving the current Model') args = parser.parse_args() use_cuda = not args.cuda and torch.cuda.is_available() @@ -137,7 +137,7 @@ def main(): optimizer = optim.Adadelta(model.parameters(), lr=args.lr) scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) - for epoch in range(1, args.epochs + 1): + for epoch in range(1, args.max_epochs + 1): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader, epoch) scheduler.step() diff --git a/requirements.txt b/requirements.txt index f66081b..23d7b2d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,10 @@ -torch==1.7.1 -pytorch-lightning==1.1.2 -torchvision==0.8.2 +# tensorflow version matches Grid.ai grid run --framework tensorflow 2.2.0 +tensorflow +# lightning version matches Grid.ai grid run --framework lightning 1.2.1 +# raise ValueError("The `preds` should be probabilities, but values were detected outside of [0,1] range.") +# ValueError: The `preds` should be probabilities, but values were detected outside of [0,1] range. +# https://github.com/PyTorchLightning/lightning-bolts/issues/551 +pytorch-lightning<1.2 +torch +torchvision +ConfigArgParse