Version 1.0 of MLFlow Slurm Plugin

ncsa · Oct 2, 2022 · 9040c5d · 9040c5d
commit 9040c5d
Show file tree

Hide file tree

Showing 18 changed files with 7,499 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, NCSA
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,37 @@
+# MLFlow-Slurm
+Backend for executing MLFlow projects on Slurm batch system
+
+## Usage
+Install this package in the environment from which you will be submitting jobs.
+If you are submitting jobs from inside jobs, make sure you have this package 
+listed in your conda or pip environment.
+
+Pass `slurm` as the `--backend` when you run the project. You should also supply
+a JSON config file to control how the batch script is constructed:
+```shell
+mlflow run --backend slurm \
+          --backend-config slurm_config.json \
+          examples/sklearn_elasticnet_wine
+```
+
+The plugin generates a batch script named after the job id, submits it via the
+Slurm `sbatch` command, and tags the run with the Slurm JobID.
+
+## Configure Jobs
+You can set values in a json file to control job submission. The supported
+properties in this file are:
+
+| Config File Setting | Use |
+|---------------------|-----|
+| partition           | Which Slurm partition should the job run in? |
+| account             | What account name to run under |
+| gpus_per_node       | On GPU partitions how many GPUs to allocate per node |
+| mem                 | Amount of memory to allocate to CPU jobs |
+| modules             | List of modules to load before starting job |
+| time                | Max CPU time job may run |
+| sbatch-script-file  | Name of batch file to be produced. Leave blank to have the service generate a script file name based on the run ID |
+
+## Development
+The slurm docker deployment is handy for testing and development. You can start
+up a slurm environment with the included docker-compose file
+(for example, `docker-compose up -d`).
+
diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -0,0 +1,14 @@
+# Development/test environment: a single Slurm container with this
+# repository mounted inside it.
+version: "3.8"
+
+services:
+  slurm:
+    image: giovtorres/docker-centos7-slurm:21.08.0
+    hostname: slurmctl
+    container_name: slurmctl
+    # The container's working directory is the mounted repository (see volumes).
+    working_dir: /mlflow-slurm
+    # Keep stdin open and allocate a TTY for the container.
+    stdin_open: true
+    tty: true
+    environment:
+      # NOTE(review): presumably selects the Python version used inside the
+      # image - confirm against the image's documentation.
+      PYTHON: "3.9"
+    volumes:
+      # Bind-mount the directory containing this file at /mlflow-slurm.
+      - ./:/mlflow-slurm
diff --git a/examples/sklearn_elasticnet_wine/MLproject b/examples/sklearn_elasticnet_wine/MLproject
@@ -0,0 +1,25 @@
+name: tutorial
+
+conda_env: conda.yaml
+# python_env: python_env.yaml
+
+entry_points:
+  # Download the red-wine quality CSV with wget and save it as
+  # {data_dir}/wine-data.csv.
+  download_data:
+    parameters:
+      data_dir: {type: path, default: .}
+    command: "wget http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv -O {data_dir}/wine-data.csv"
+
+  # Run train.py with the given alpha and l1_ratio on {data_dir}/wine-data.csv.
+  # NOTE(review): the script path is an absolute path into one user's home
+  # directory; confirm it exists on the system where the job runs.
+  train:
+    parameters:
+      alpha: {type: float, default: 0.5}
+      l1_ratio: {type: float, default: 0.1}
+      data_dir: {type: path, default: .}
+    command: "python /u/bengal1/mlflow-slurm/examples/sklearn_elasticnet_wine/train.py {alpha} {l1_ratio} {data_dir}/wine-data.csv"
+
+  # Run search.py, which launches num_runs training jobs.
+  # NOTE(review): same absolute-path caveat as the train entry point.
+  search:
+    parameters:
+      num_runs: {type: int, default: 2}
+      data_dir: {type: path, default: .}
+      # JSON backend config handed to each training job.
+      train_backend_config: {type: path, default: slurm_config.json}
+
+    command: "python /u/bengal1/mlflow-slurm/examples/sklearn_elasticnet_wine/search.py --num-runs {num_runs} --data-dir {data_dir} --train-backend-config {train_backend_config}"
diff --git a/examples/sklearn_elasticnet_wine/conda.yaml b/examples/sklearn_elasticnet_wine/conda.yaml
@@ -0,0 +1,13 @@
+# Conda environment for this example; the MLproject file in the same
+# directory points at it through its `conda_env` entry.
+name: tutorial
+channels:
+  - conda-forge
+dependencies:
+  - python=3.7
+  - pip
+  # Everything below is installed with pip inside the conda environment.
+  - pip:
+      - scikit-learn==0.23.2
+      - mlflow>=1.0
+      - pandas
+      - boto3
+      # Installs the mlflow-slurm plugin from the `initial` ref of its GitHub
+      # repository.
+      - git+https://github.com/ncsa/mlflow-slurm.git@initial
+
diff --git a/examples/sklearn_elasticnet_wine/python_env.yaml b/examples/sklearn_elasticnet_wine/python_env.yaml
@@ -0,0 +1,15 @@
+# Python environment spec for this example.
+# NOTE(review): the MLproject file in this directory has its `python_env`
+# line commented out, so this file appears to be unused unless that line is
+# re-enabled - confirm.
+# Python version required to run the project.
+python: "3.9"
+# Dependencies required to build packages. This field is optional.
+build_dependencies:
+  - pip
+  - setuptools
+  - wheel==0.37.1
+# Dependencies required to run the project.
+dependencies:
+  - mlflow
+  - scikit-learn==1.0.2
+  - pandas
+  - boto3
+# The mlflow-slurm plugin itself is left disabled in this environment:
+#  - git+https://github.com/ncsa/mlflow-slurm.git@initial
+
diff --git a/examples/sklearn_elasticnet_wine/search.py b/examples/sklearn_elasticnet_wine/search.py
@@ -0,0 +1,56 @@
+import os
+import click
+import numpy as np
+
+import mlflow
+from mlflow.entities import Param, RunTag
+from mlflow.tracking import MlflowClient
+
+# Module-level MLflow tracking client.
+# NOTE(review): the functions visible in this file do not reference it
+# (run_train builds its own MlflowClient()).
+tracking_client = mlflow.tracking.MlflowClient()
+
+
+def run_train(experiment_id, alpha, l1_ratio, data_dir, backend_config="slurm_config.json", parent_run_id=None):
+    p = mlflow.projects.run(
+        uri=os.path.dirname(os.path.realpath(__file__)),
+        entry_point="train",
+        parameters={
+            "alpha": str(alpha),
+            "l1_ratio": str(l1_ratio),
+            "data_dir": data_dir
+        },
+        experiment_id=experiment_id,
+        synchronous=False,
+        backend="slurm",
+        backend_config=backend_config
+    )
+    MlflowClient().log_batch(run_id=p.run_id, metrics=[],
+                             params=[Param("alpha", str(alpha)), Param("alpha", str(alpha))],
+                             tags=[RunTag(mlflow.utils.mlflow_tags.MLFLOW_PARENT_RUN_ID, parent_run_id)])
+
+    return p
+
+
+@click.command(help="Perform grid search over train (main entry point).")
+@click.option("--num-runs", type=click.INT, default=2, help="Maximum number of runs to evaluate.")
+@click.option("--train-backend-config", type=click.STRING, default="slurm_config.json", help="Json file for training jobs")
+@click.option("--data-dir", type=click.STRING, default=".", help="Directory for wine data")
+def run(num_runs, train_backend_config, data_dir):
+    provided_run_id = os.environ.get("MLFLOW_RUN_ID", None)
+    with mlflow.start_run(run_id=provided_run_id) as run:
+        print("Search is run_id ", run.info.run_id)
+        experiment_id = run.info.experiment_id
+        runs = [(np.random.uniform(1e-5, 1e-1), np.random.uniform(0, 1.0)) for _ in range(num_runs)]
+        jobs = []
+        for alpha, ll_ratio in runs:
+            jobs.append(run_train(
+                experiment_id,
+                alpha=alpha, l1_ratio=ll_ratio, data_dir=data_dir,
+                backend_config=train_backend_config,
+                parent_run_id=provided_run_id)
+            )
+        results = map(lambda job: job.wait(), jobs)
+        print(list(results))
+
+
+# Invoke the click command when this file is executed as a script.
+if __name__ == "__main__":
+    run()