
Merge pull request #600 from mlcommons/dev
dev -> main
priyakasimbeg authored Dec 20, 2023
2 parents ca87833 + bfa2ba2 commit 8cbdfd1
Showing 69 changed files with 3,032 additions and 321 deletions.
85 changes: 85 additions & 0 deletions .github/workflows/regression_tests_variants.yml
@@ -0,0 +1,85 @@
name: Containerized Regression Tests for Workload Variants

on:
pull_request:
branches:
- 'main'

jobs:
build_and_push_jax_docker_image:
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- name: Build and push docker images
run: |
GIT_BRANCH=${{ github.head_ref || github.ref_name }}
FRAMEWORK=jax
IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}"
cd $HOME/algorithmic-efficiency/docker
docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH
BUILD_RETURN=$?
if [[ ${BUILD_RETURN} != 0 ]]; then exit ${BUILD_RETURN}; fi
docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME
docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME
build_and_push_pytorch_docker_image:
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- name: Build and push docker images
run: |
GIT_BRANCH=${{ github.head_ref || github.ref_name }}
FRAMEWORK=pytorch
IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}"
cd $HOME/algorithmic-efficiency/docker
docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH
BUILD_RETURN=$?
if [[ ${BUILD_RETURN} != 0 ]]; then exit ${BUILD_RETURN}; fi
docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME
docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME
criteo_layernorm_jax:
runs-on: self-hosted
needs: build_and_push_jax_docker_image
steps:
- uses: actions/checkout@v2
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}
docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s baselines/adamw/jax/submission.py -w criteo1tb_layernorm -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_resnet_jax:
runs-on: self-hosted
needs: build_and_push_jax_docker_image
steps:
- uses: actions/checkout@v2
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }}
docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s baselines/adamw/jax/submission.py -w criteo1tb_resnet -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_layernorm_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
steps:
- uses: actions/checkout@v2
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s baselines/adamw/pytorch/submission.py -w criteo1tb_layernorm -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_resnet_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
steps:
- uses: actions/checkout@v2
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
criteo_embed_init_pytorch:
runs-on: self-hosted
needs: build_and_push_pytorch_docker_image
steps:
- uses: actions/checkout@v2
- name: Run containerized workload
run: |
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }}
docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s baselines/adamw/pytorch/submission.py -w criteo1tb_embed_init -t baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false
24 changes: 12 additions & 12 deletions GETTING_STARTED.md
@@ -156,29 +156,29 @@ To use the Docker container as an interactive virtual environment, you can run a

### Using Singularity/Apptainer instead of Docker

-Since many compute clusters don't allow the usage of Docker due to security concerns and instead encourage the use of [Singularity/Apptainer](https://github.com/apptainer/apptainer) (formerly Singularity, now called Apptainer), we also provide instructions on how to build an Apptainer container based on the Dockerfile provided here.
-To convert the Dockerfile into an Apptainer definition file, we will use [spython](https://github.com/singularityhub/singularity-cli):
+Since many compute clusters don't allow the usage of Docker due to security concerns and instead encourage the use of [Singularity/Apptainer](https://github.com/apptainer/apptainer) (formerly Singularity, now called Apptainer), we also provide an Apptainer recipe (located at `docker/Singularity.def`) that can be used to build an image by running
```bash
-pip3 install spython
-cd algorithmic-efficiency/docker
-spython recipe Dockerfile &> Singularity.def
+singularity build --fakeroot <singularity_image_name>.sif Singularity.def
```
-Now we can build the Apptainer image by running
+Note that this can take several minutes. Then, to start a shell session with GPU support (by using the `--nv` flag), we can run
```bash
-singularity build --fakeroot <singularity_image_name>.sif Singularity.def
+singularity shell --bind $HOME/data:/data,$HOME/experiment_runs:/experiment_runs \
+    --nv <singularity_image_name>.sif
```
-To start a shell session with GPU support (by using the `--nv` flag), we can run
+Note the `--bind` flag which, similarly to Docker, allows you to bind specific paths on the host system and the container, as explained [here](https://docs.sylabs.io/guides/3.7/user-guide/bind_paths_and_mounts.html).
+Also note that we generated `Singularity.def` automatically from the `Dockerfile` using [spython](https://github.com/singularityhub/singularity-cli), as follows:
```bash
-singularity shell --nv <singularity_image_name>.sif
+pip3 install spython
+cd algorithmic-efficiency/docker
+python scripts/singularity_converter.py -i Dockerfile -o Singularity.def
```
-Similarly to Docker, Apptainer allows you to bind specific paths on the host system and the container by specifying the `--bind` flag, as explained [here](https://docs.sylabs.io/guides/3.7/user-guide/bind_paths_and_mounts.html).
+Users that wish to customize their images are invited to check and modify the `Singularity.def` recipe and the `singularity_converter.py` script.
## Download the Data
6 changes: 6 additions & 0 deletions README.md
@@ -23,6 +23,12 @@

> *AlgoPerf* is a suite of benchmarks and competitions to measure neural network training speedups due to algorithmic improvements in both training algorithms and models. This is the repository for the *AlgoPerf: Training Algorithms benchmark* and its associated competition. It is developed by the [MLCommons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). This repository holds the [**competition rules**](/COMPETITION_RULES.md), the [**technical documentation**](/DOCUMENTATION.md) of the benchmark, [**getting started guides**](/GETTING_STARTED.md), and the benchmark code. For a detailed description of the benchmark design, see our [**paper**](https://arxiv.org/abs/2306.07179).
---

> [!IMPORTANT]
> Upcoming Deadline:
> Registration deadline to express non-binding intent to submit: **January 28th, 2024**
## Table of Contents <!-- omit from toc -->

- [Installation](#installation)
111 changes: 109 additions & 2 deletions algorithmic_efficiency/workloads/criteo1tb/criteo1tb_jax/models.py
@@ -7,6 +7,101 @@
import jax.numpy as jnp


class DLRMResNet(nn.Module):
"""Define a DLRMResNet model.
Parameters:
vocab_size: the size of a single unified embedding table.
mlp_bottom_dims: dimensions of dense layers of the bottom mlp.
mlp_top_dims: dimensions of dense layers of the top mlp.
num_dense_features: number of dense features as the bottom mlp input.
embed_dim: embedding dimension.
"""

vocab_size: int = 32 * 128 * 1024 # 4_194_304
num_dense_features: int = 13
mlp_bottom_dims: Sequence[int] = (256, 256, 256)
mlp_top_dims: Sequence[int] = (256, 256, 256, 256, 1)
embed_dim: int = 128
dropout_rate: float = 0.0
use_layer_norm: bool = False  # Unused.
embedding_init_multiplier: float = None  # Unused.

@nn.compact
def __call__(self, x, train):
bot_mlp_input, cat_features = jnp.split(x, [self.num_dense_features], 1)
cat_features = jnp.asarray(cat_features, dtype=jnp.int32)

# bottom mlp
mlp_bottom_dims = self.mlp_bottom_dims

bot_mlp_input = nn.Dense(
mlp_bottom_dims[0],
kernel_init=jnn.initializers.glorot_uniform(),
bias_init=jnn.initializers.normal(stddev=1.0 / mlp_bottom_dims[0]**0.5),
)(
bot_mlp_input)
bot_mlp_input = nn.relu(bot_mlp_input)

for dense_dim in mlp_bottom_dims[1:]:
x = nn.Dense(
dense_dim,
kernel_init=jnn.initializers.glorot_uniform(),
bias_init=jnn.initializers.normal(stddev=1.0 / dense_dim**0.5),
)(
bot_mlp_input)
bot_mlp_input += nn.relu(x)

base_init_fn = jnn.initializers.uniform(scale=1.0)
# Embedding table init and lookup for a single unified table.
idx_lookup = jnp.reshape(cat_features, [-1]) % self.vocab_size

def scaled_init(key, shape, dtype=jnp.float_):
return base_init_fn(key, shape, dtype) / jnp.sqrt(self.vocab_size)

embedding_table = self.param('embedding_table',
scaled_init, [self.vocab_size, self.embed_dim])

embed_features = embedding_table[idx_lookup]
batch_size = bot_mlp_input.shape[0]
embed_features = jnp.reshape(embed_features,
(batch_size, 26 * self.embed_dim))
top_mlp_input = jnp.concatenate([bot_mlp_input, embed_features], axis=1)
mlp_input_dim = top_mlp_input.shape[1]
mlp_top_dims = self.mlp_top_dims
num_layers_top = len(mlp_top_dims)
top_mlp_input = nn.Dense(
mlp_top_dims[0],
kernel_init=jnn.initializers.normal(
stddev=jnp.sqrt(2.0 / (mlp_input_dim + mlp_top_dims[0]))),
bias_init=jnn.initializers.normal(
stddev=jnp.sqrt(1.0 / mlp_top_dims[0])))(
top_mlp_input)
top_mlp_input = nn.relu(top_mlp_input)
for layer_idx, fan_out in list(enumerate(mlp_top_dims))[1:-1]:
fan_in = mlp_top_dims[layer_idx - 1]
x = nn.Dense(
fan_out,
kernel_init=jnn.initializers.normal(
stddev=jnp.sqrt(2.0 / (fan_in + fan_out))),
bias_init=jnn.initializers.normal(
stddev=jnp.sqrt(1.0 / mlp_top_dims[layer_idx])))(
top_mlp_input)
x = nn.relu(x)
if self.dropout_rate > 0.0 and layer_idx == num_layers_top - 2:
x = nn.Dropout(rate=self.dropout_rate, deterministic=not train)(x)
top_mlp_input += x
# In the DLRM model the last layer width is always 1. We can hardcode that
# below.
logits = nn.Dense(
1,
kernel_init=jnn.initializers.normal(
stddev=jnp.sqrt(2.0 / (mlp_top_dims[-2] + 1))),
bias_init=jnn.initializers.normal(stddev=jnp.sqrt(1.0)))(
top_mlp_input)
return logits


def dot_interact(concat_features):
"""Performs feature interaction operation between dense or sparse features.
Input tensors represent dense or sparse features.
@@ -52,6 +147,8 @@ class DlrmSmall(nn.Module):
mlp_top_dims: Sequence[int] = (1024, 1024, 512, 256, 1)
embed_dim: int = 128
dropout_rate: float = 0.0
use_layer_norm: bool = False
embedding_init_multiplier: float = None

@nn.compact
def __call__(self, x, train):
@@ -67,6 +164,8 @@ def __call__(self, x, train):
)(
bot_mlp_input)
bot_mlp_input = nn.relu(bot_mlp_input)
if self.use_layer_norm:
bot_mlp_input = nn.LayerNorm()(bot_mlp_input)
bot_mlp_output = bot_mlp_input
batch_size = bot_mlp_output.shape[0]
feature_stack = jnp.reshape(bot_mlp_output,
@@ -75,9 +174,13 @@
# Embedding table look-up.
idx_lookup = jnp.reshape(cat_features, [-1]) % self.vocab_size

+ if self.embedding_init_multiplier is None:
+   scale = 1 / jnp.sqrt(self.vocab_size)
+ else:
+   scale = self.embedding_init_multiplier
+
def scaled_init(key, shape, dtype=jnp.float_):
- return (jnn.initializers.uniform(scale=1.0)(key, shape, dtype) /
-     jnp.sqrt(self.vocab_size))
+ return jnn.initializers.uniform(scale=1.0)(key, shape, dtype) * scale

embedding_table = self.param('embedding_table',
scaled_init, [self.vocab_size, self.embed_dim])
@@ -86,6 +189,8 @@ def scaled_init(key, shape, dtype=jnp.float_):
embed_features = embedding_table[idx_lookup]
embed_features = jnp.reshape(embed_features,
[batch_size, -1, self.embed_dim])
if self.use_layer_norm:
embed_features = nn.LayerNorm()(embed_features)
feature_stack = jnp.concatenate([feature_stack, embed_features], axis=1)
dot_interact_output = dot_interact(concat_features=feature_stack)
top_mlp_input = jnp.concatenate([bot_mlp_output, dot_interact_output],
@@ -103,6 +208,8 @@ def scaled_init(key, shape, dtype=jnp.float_):
top_mlp_input)
if layer_idx < (num_layers_top - 1):
top_mlp_input = nn.relu(top_mlp_input)
if self.use_layer_norm:
top_mlp_input = nn.LayerNorm()(top_mlp_input)
if (self.dropout_rate is not None and self.dropout_rate > 0.0 and
layer_idx == num_layers_top - 2):
top_mlp_input = nn.Dropout(
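The new `DLRMResNet` follows the same Flax conventions as `DlrmSmall`, so a forward pass can be smoke-tested in a few lines. Below is a minimal sketch (not part of this commit), assuming this branch's `models.py` is importable; the small vocabulary mirrors `Criteo1TbDlrmSmallTestWorkload`, and the fake-batch layout of 13 dense + 26 categorical features matches `init_model_fn` in the workload diff that follows.

```python
import jax
import jax.numpy as jnp

from algorithmic_efficiency.workloads.criteo1tb.criteo1tb_jax import models

# A small vocabulary keeps the embedding table tiny for a quick check.
model = models.DLRMResNet(vocab_size=32 * 128 * 16)
# Each example: 13 dense features followed by 26 categorical ids.
fake_batch = jnp.zeros((2, 13 + 26))
params_rng, dropout_rng = jax.random.split(jax.random.PRNGKey(0))
variables = model.init({'params': params_rng, 'dropout': dropout_rng},
                       fake_batch, train=False)
logits = model.apply(variables, fake_batch, train=False)
print(logits.shape)  # (2, 1): one logit per example
```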
@@ -73,23 +73,31 @@ def init_model_fn(
self,
rng: spec.RandomState,
dropout_rate: Optional[float] = None,
- aux_dropout_rate: Optional[float] = None) -> spec.ModelInitState:
+ aux_dropout_rate: Optional[float] = None,
+ tabulate: Optional[bool] = False,
+ ) -> spec.ModelInitState:
"""Only dropout is used."""
del aux_dropout_rate
- self._model = models.DlrmSmall(
+ if self.use_resnet:
+   model_class = models.DLRMResNet
+ else:
+   model_class = models.DlrmSmall
+ self._model = model_class(
vocab_size=self.vocab_size,
num_dense_features=self.num_dense_features,
mlp_bottom_dims=self.mlp_bottom_dims,
mlp_top_dims=self.mlp_top_dims,
embed_dim=self.embed_dim,
- dropout_rate=dropout_rate)
+ dropout_rate=dropout_rate,
+ use_layer_norm=self.use_layer_norm,
+ embedding_init_multiplier=self.embedding_init_multiplier)

params_rng, dropout_rng = jax.random.split(rng)
init_fake_batch_size = 2
num_categorical_features = 26
- input_size = self.num_dense_features + num_categorical_features
+ num_dense_features = 13
+ input_size = num_dense_features + num_categorical_features
input_shape = (init_fake_batch_size, input_size)

init_fn = functools.partial(self._model.init, train=False)
initial_variables = jax.jit(init_fn)(
{'params': params_rng, 'dropout': dropout_rng},
@@ -154,3 +162,53 @@ def _eval_batch(self,

class Criteo1TbDlrmSmallTestWorkload(Criteo1TbDlrmSmallWorkload):
vocab_size: int = 32 * 128 * 16


class Criteo1TbDlrmSmallLayerNormWorkload(Criteo1TbDlrmSmallWorkload):

@property
def use_layer_norm(self) -> bool:
"""Whether or not to use LayerNorm in the model."""
return True

@property
def validation_target_value(self) -> float:
return 0.123744

@property
def test_target_value(self) -> float:
return 0.126152


class Criteo1TbDlrmSmallResNetWorkload(Criteo1TbDlrmSmallWorkload):
mlp_bottom_dims: Tuple[int, int, int] = (256, 256, 256)
mlp_top_dims: Tuple[int, int, int, int, int] = (256, 256, 256, 256, 1)

@property
def use_resnet(self) -> bool:
"""Whether or not to use residual connections in the model."""
return True

@property
def validation_target_value(self) -> float:
return 0.124027

@property
def test_target_value(self) -> float:
return 0.126468


class Criteo1TbDlrmSmallEmbedInitWorkload(Criteo1TbDlrmSmallWorkload):

@property
def validation_target_value(self) -> float:
return 0.124286

@property
def test_target_value(self) -> float:
# Todo
return 0.126725

@property
def embedding_init_multiplier(self) -> float:
return 1.0
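For context on the `EmbedInit` variant above: in `DlrmSmall` (see the `models.py` diff earlier), `embedding_init_multiplier` simply replaces the default `1 / sqrt(vocab_size)` scaling of the uniform embedding initializer. A small illustrative sketch (not from the repo) of how much the two scales differ, using the test-workload vocabulary size:

```python
import jax
import jax.numpy as jnp

vocab_size = 32 * 128 * 16  # test-workload vocabulary size
embed_dim = 128

# jnn.initializers.uniform(scale=1.0) draws from [0, 1).
sample = jax.nn.initializers.uniform(scale=1.0)(
    jax.random.PRNGKey(0), (vocab_size, embed_dim))

default_scale = 1 / jnp.sqrt(vocab_size)  # used when the multiplier is None
embed_init_scale = 1.0                    # Criteo1TbDlrmSmallEmbedInitWorkload

print(float(jnp.std(sample * default_scale)))     # roughly 0.001
print(float(jnp.std(sample * embed_init_scale)))  # roughly 0.29
```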