Merge pull request #610 from fsschneider/dev

Move deadline, highlight registration form, reorganize folders

priyakasimbeg authored Jan 12, 2024
2 parents 91a6169 + 22ab1a7 commit b0ec4c7
Showing 80 changed files with 83 additions and 62 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -18,8 +18,8 @@ jobs:
- name: Run pylint
run: |
pylint algorithmic_efficiency
- pylint baselines
pylint reference_algorithms
+ pylint prize_qualification_baselines
pylint submission_runner.py
pylint tests
32 changes: 16 additions & 16 deletions .github/workflows/regression_tests.yml

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions .github/workflows/regression_tests_variants.yml
@@ -44,7 +44,7 @@ jobs:
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}
- docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s baselines/adamw/jax/submission.py -w criteo1tb_layernorm -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
+ docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w criteo1tb_layernorm -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_resnet_jax:
runs-on: self-hosted
needs: build_and_push_jax_docker_image
@@ -53,7 +53,7 @@ jobs:
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}
- docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s baselines/adamw/jax/submission.py -w criteo1tb_resnet -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
+ docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_layernorm_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
@@ -62,7 +62,7 @@ jobs:
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
- docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s baselines/adamw/pytorch/submission.py -w criteo1tb_layernorm -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
+ docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_layernorm -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_resnet_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
@@ -71,7 +71,7 @@ jobs:
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
- docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
+ docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_resnet_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
@@ -80,6 +80,6 @@ jobs:
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
- docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s baselines/adamw/pytorch/submission.py -w criteo1tb_embed_init -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
+ docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_embed_init -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
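
The jobs above differ only in the Docker image framework, the workload variant, and the submission/tuning-space paths updated by this PR; schematically, each runs the pattern below (the angle-bracket placeholders are editorial, not variables that exist in the workflow):

```bash
# Editorial sketch of the shared pattern across the regression-test jobs;
# <framework> and <variant> are placeholders, not CI variables.
docker run \
  -v $HOME/data/:/data/ \
  -v $HOME/experiment_runs/:/experiment_runs \
  -v $HOME/experiment_runs/logs:/logs \
  --gpus all --ipc=host \
  us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_<framework>_${{ github.head_ref || github.ref_name }} \
  -d criteo1tb -f <framework> \
  -s reference_algorithms/paper_baselines/adamw/<framework>/submission.py \
  -w criteo1tb_<variant> \
  -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json \
  -e tests/regression_tests/adamw \
  -m 10 -c False -o True -r false
```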
5 changes: 3 additions & 2 deletions CALL_FOR_SUBMISSIONS.md
@@ -13,8 +13,9 @@ Submissions can compete under two hyperparameter tuning rulesets (with separate

## Dates

- - **Call for submissions: November 28th, 2023**
- - Registration deadline to express non-binding intent to submit: January 28th, 2024
+ - Call for submissions: November 28th, 2023
+ - **Registration deadline to express non-binding intent to submit: February 28th, 2024**.\
+   Please fill out the (mandatory but non-binding) [**registration form**](https://forms.gle/K7ty8MaYdi2AxJ4N8).
- **Submission deadline: March 28th, 2024**
- **Deadline for self-reporting preliminary results: May 28th, 2024**
- [tentative] Announcement of all results: July 15th, 2024
4 changes: 2 additions & 2 deletions COMPETITION_RULES.md
@@ -41,7 +41,7 @@ The Competition is open to English-speaking individuals and teams (made of indiv

The Competition begins at 12:01am (ET) on November 28, 2023 and ends at 11:59pm (ET) on May 28, 2024, all according to Sponsor's time clock, which decisions are final (the "Competition Period"). There are several deadlines contained within the Competition Period:

- - **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on January 28, 2024.
+ - **Intention to Submit.** You must register your Intention to Submit no later than 11:59pm ET on February 28, 2024.
- **Submission Period.** You must complete your Submission and enter it after the Intention to Submit deadline, but no later than 11:59pm ET on March 28, 2024.
- **Deadline for self-reporting results.** 11:59pm ET on May 28, 2024.

@@ -79,7 +79,7 @@ Submissions must use specific versions of PyTorch and JAX, provided by Sponsor.

## Scoring

- All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload, using measuring techniques designed to give all Submissions equal parity. In the event that no Submission in a ruleset receives a score exceeding that of both [prize qualification baselines](./reference_algorithms/prize_qualification_baselines/README.md), no prizes will be awarded for this ruleset. The Teams with the highest scores will be determined to be winners ("Selected Teams"). In the event of a tie the prize money will be split equally between the winners.
+ All otherwise qualified Submissions shall be scored. Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload, using measuring techniques designed to give all Submissions equal parity. In the event that no Submission in a ruleset receives a score exceeding that of both [prize qualification baselines](./prize_qualification_baselines/README.md), no prizes will be awarded for this ruleset. The Teams with the highest scores will be determined to be winners ("Selected Teams"). In the event of a tie the prize money will be split equally between the winners.

## Submissions

4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -228,7 +228,7 @@ To run the below commands, use the versions installed via `pip install -e '.[dev
To automatically fix formatting errors, run the following (*WARNING:* this will edit your code, so it is suggested to make a git commit first!):

```bash
- yapf -i -r -vv -p algorithmic_efficiency baselines datasets reference_algorithms tests *.py
+ yapf -i -r -vv -p algorithmic_efficiency datasets prize_qualification_baselines reference_algorithms tests *.py
```

To sort all import orderings, run the following:
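
(The command itself is collapsed in this diff view; given the `[dev]` tooling named above, it is presumably the repository's isort invocation, along the lines of:)

```bash
# Presumed command; the actual block is hidden in the collapsed hunk.
isort .
```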
@@ -247,8 +247,8 @@ To print out all offending pylint issues, run the following:

```bash
pylint algorithmic_efficiency
- pylint baselines
pylint datasets
+ pylint prize_qualification_baselines
pylint reference_algorithms
pylint submission_runner.py
pylint tests
6 changes: 2 additions & 4 deletions DOCUMENTATION.md
@@ -38,10 +38,9 @@
- [How can I know if my code can be run on benchmarking hardware?](#how-can-i-know-if-my-code-can-be-run-on-benchmarking-hardware)
- [Are we allowed to use our own hardware to self-report the results?](#are-we-allowed-to-use-our-own-hardware-to-self-report-the-results)
- [What can I do if running the benchmark is too expensive for me?](#what-can-i-do-if-running-the-benchmark-is-too-expensive-for-me)
- - [Can I submit existing (i.e. published) training algorithms as submissions?](#can-i-submit-previously-published-training-algorithms-as-submissions)
+ - [Can I submit previously published training algorithms as submissions?](#can-i-submit-previously-published-training-algorithms-as-submissions)
- [Disclaimers](#disclaimers)
- [Shared Data Pipelines between JAX and PyTorch](#shared-data-pipelines-between-jax-and-pytorch)
- - [Pytorch Conformer CUDA OOM](#pytorch-conformer-cuda-oom)

## Introduction

@@ -517,7 +516,7 @@ To ensure that all submitters can develop their submissions based on the same co

#### My machine only has one GPU. How can I use this repo?

- You can run this repo on a machine with an arbitrary number of GPUs. However, the default batch sizes in our reference algorithms `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms` are tuned for a machine with 8 16GB V100 GPUs. You may run into OOMs if you run these algorithms with fewer than 8 GPUs. If you run into these issues because you are using a machine with less total GPU memory, please reduce the batch sizes for the submission. Note that your final submission must 'fit' on the benchmarking hardware, so if you are using fewer
+ You can run this repo on a machine with an arbitrary number of GPUs. However, the default batch sizes in our reference algorithms (e.g. `algorithmic-efficiency/prize_qualification_baselines` and `algorithmic-efficiency/reference_algorithms`) are tuned for a machine with 8 16GB V100 GPUs. You may run into OOMs if you run these algorithms with fewer than 8 GPUs. If you run into these issues because you are using a machine with less total GPU memory, please reduce the batch sizes for the submission. Note that your final submission must 'fit' on the benchmarking hardware, so if you are using fewer
GPUs with higher per GPU memory, please monitor your memory usage to make sure it will fit on 8xV100 GPUs with 16GB of VRAM per card.
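
For example, a submission can return smaller values from its `get_batch_size` function; a minimal sketch (assuming the submission API's `get_batch_size(workload_name)` entry point; the workload names and values below are illustrative assumptions, not recommendations):

```python
# Sketch: smaller per-workload batch sizes for a single-GPU machine.
# Workload names and values are illustrative, not tuned recommendations.
def get_batch_size(workload_name: str) -> int:
  reduced_batch_sizes = {
      'mnist': 512,
      'imagenet_resnet': 512,  # e.g. half of a default tuned for 8x V100
      'wmt': 64,
  }
  return reduced_batch_sizes[workload_name]
```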

#### How do I run this on my SLURM cluster?
@@ -576,4 +575,3 @@ The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT

Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details.
While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example.
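
Schematically, this strategy looks like the sketch below (illustrative only; the linked WMT implementation additionally handles shapes, dtypes, and padding):

```python
import torch
import torch.distributed as dist

def rank0_input_pipeline(tf_iterator, batch_shape, device):
  """Sketch: rank 0 runs the TF pipeline; all ranks receive its batches.

  Assumes torch.distributed is already initialized (init_process_group)
  and that every batch has the same, known shape.
  """
  rank = dist.get_rank()
  while True:
    if rank == 0:
      batch = torch.as_tensor(next(tf_iterator), device=device)
    else:
      batch = torch.empty(batch_shape, device=device)  # buffer to receive into
    dist.broadcast(batch, src=0)  # same collective on every process
    yield batch
```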

9 changes: 5 additions & 4 deletions GETTING_STARTED.md
@@ -163,6 +163,7 @@ singularity build --fakeroot <singularity_image_name>.sif Singularity.def
```
Note that this can take several minutes. Then, to start a shell session with GPU support (by using the `--nv` flag), we can run
```bash
singularity shell --bind $HOME/data:/data,$HOME/experiment_runs:/experiment_runs \
--nv <singularity_image_name>.sif
@@ -194,7 +195,7 @@ Make a submissions subdirectory to store your submission modules e.g. `algorithm
### Coding your Submission
- You can find examples of sumbission modules under `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms`. \
+ You can find examples of submission modules under `algorithmic-efficiency/prize_qualification_baselines` and `algorithmic-efficiency/reference_algorithms`. \
A submission for the external ruleset will consist of a submission module and a tuning search space definition.
1. Copy the template submission module `submissions/template/submission.py` into your submissions directory e.g. in `algorithmic-efficiency/my_submissions`.
@@ -210,7 +211,7 @@ A submission for the external ruleset will consist of a submission module and a
}
```
- For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json).
+ For a complete example see [tuning_search_space.json](/reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json).
2. Define a range of values for quasirandom sampling by specifing a `min`, `max` and `scaling` keys for the hyperparameter:
@@ -224,7 +225,7 @@ A submission for the external ruleset will consist of a submission module and a
}
```
- For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json).
+ For a complete example see [tuning_search_space.json](/reference_algorithms/paper_baselines/nadamw/tuning_search_space.json).
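
Combining both styles, a hypothetical search space (hyperparameter names and values here are illustrative assumptions, not taken from the linked files) could look like:

```JSON
{
  "learning_rate": {
    "min": 1e-4,
    "max": 1e-2,
    "scaling": "log"
  },
  "one_minus_beta1": {
    "feasible_points": [0.1, 0.05, 0.02]
  }
}
```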
## Run your Submission
@@ -342,6 +343,6 @@ To produce performance profile and performance table:
python3 scoring/score_submission.py --experiment_path=<path_to_experiment_dir> --output_dir=<output_dir>
```

- We provide the scores and performance profiles for the baseline algorithms in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179).
+ We provide the scores and performance profiles for the [paper baseline algorithms](/reference_algorithms/paper_baselines/) in the "Baseline Results" section in [Benchmarking Neural Network Training Algorithms](https://arxiv.org/abs/2306.07179).

**Good Luck!**
15 changes: 10 additions & 5 deletions README.md
@@ -18,6 +18,7 @@
[![Lint](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/linting.yml/badge.svg)](https://github.com/mlcommons/algorithmic-efficiency/actions/workflows/linting.yml)
[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/mlcommons/algorithmic-efficiency/blob/main/LICENSE.md)
[![Code style: yapf](https://img.shields.io/badge/code%20style-yapf-orange)](https://github.com/google/yapf)
+ [![Discord](https://dcbadge.vercel.app/api/server/5FPXK7SMt6?style=flat)](https://discord.gg/5FPXK7SMt6)

---

@@ -27,7 +28,8 @@

> [!IMPORTANT]
> Upcoming Deadline:
- > Registration deadline to express non-binding intent to submit: **January 28th, 2024**
+ > Registration deadline to express non-binding intent to submit: **February 28th, 2024**.\
+ > **If you consider submitting, please fill out the** (mandatory but non-binding) [**registration form**](https://forms.gle/K7ty8MaYdi2AxJ4N8).
## Table of Contents <!-- omit from toc -->

@@ -42,6 +44,9 @@

## Installation

+ > [!TIP]
+ > **If you have any questions about the benchmark competition or you run into any issues, please feel free to contact us.** Either [file an issue](https://github.com/mlcommons/algorithmic-efficiency/issues), ask a question on [our Discord](https://discord.gg/5FPXK7SMt6) or [join our weekly meetings](https://mlcommons.org/en/groups/research-algorithms/).
You can install this package and dependencies in a [Python virtual environment](/GETTING_STARTED.md#python-virtual-environment) or use a [Docker/Singularity/Apptainer container](/GETTING_STARTED.md#docker) (recommended).
We recommend using a Docker container (or alternatively, a Singularity/Apptainer container) to ensure a similar environment to our scoring and testing environments.
Both options are described in detail in the [**Getting Started**](/GETTING_STARTED.md) document.
@@ -74,8 +79,8 @@ python3 submission_runner.py \
--workload=mnist \
--experiment_dir=$HOME/experiments \
--experiment_name=my_first_experiment \
- --submission_path=baselines/adamw/jax/submission.py \
- --tuning_search_space=baselines/adamw/tuning_search_space.json
+ --submission_path=reference_algorithms/paper_baselines/adamw/jax/submission.py \
+ --tuning_search_space=reference_algorithms/paper_baselines/adamw/tuning_search_space.json
```

*TL;DR running a PyTorch workload:*
@@ -86,8 +91,8 @@ python3 submission_runner.py \
--workload=mnist \
--experiment_dir=$HOME/experiments \
--experiment_name=my_first_experiment \
- --submission_path=baselines/adamw/jax/submission.py \
- --tuning_search_space=baselines/adamw/tuning_search_space.json
+ --submission_path=reference_algorithms/paper_baselines/adamw/pytorch/submission.py \
+ --tuning_search_space=reference_algorithms/paper_baselines/adamw/tuning_search_space.json
```
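
For multi-GPU PyTorch runs, the getting-started guide launches the same script through `torchrun`; a sketch for a single node with 8 GPUs (assuming the stock `torchrun` flags):

```bash
# Sketch: the MNIST example above under DistributedDataParallel on 8 GPUs.
torchrun --standalone --nnodes=1 --nproc_per_node=8 \
    submission_runner.py \
    --framework=pytorch \
    --workload=mnist \
    --experiment_dir=$HOME/experiments \
    --experiment_name=my_first_experiment \
    --submission_path=reference_algorithms/paper_baselines/adamw/pytorch/submission.py \
    --tuning_search_space=reference_algorithms/paper_baselines/adamw/tuning_search_space.json
```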

## Call for Submissions
3 changes: 0 additions & 3 deletions baselines/README.md

This file was deleted.
