Skip to content

Commit

Permalink
Refactor scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
matbun committed Oct 31, 2024
1 parent b4321fd commit f7dc775
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 21 deletions.
6 changes: 3 additions & 3 deletions tests/torch/runall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ rm -rf logs_torchrun
# Dataset location exported for the tests (the trailing comment keeps an alternative path)
export MNIST_PATH="/ceph/hpc/data/st2301-itwin-users/mbunino/mnist" #"/p/project1/intertwin/smalldata/mnist"

# Available container images (OMPI presumably refers to the OpenMPI version shipped in the image)
# - itwinai_torch.sif: cmcc jlab container
# - itwinai_torch2.sif: itwinai 0.2.2.dev torch2.4
# - itwinai_torch3.sif: itwinai 0.2.2.dev2 torch2.4 - force distributed
# - itwinai_torch.sif: cmcc jlab container (OMPI v5)
# - itwinai_torch2.sif: itwinai 0.2.2.dev torch2.4 (OMPI v4.1)
# - itwinai_torch3.sif: itwinai 0.2.2.dev2 torch2.4 - force distributed (OMPI v4.1)
# Container image selected for this run: one of the images listed above
export CONTAINER_PATH="itwinai_torch3.sif"

# Disable pytest ANSI coloring
Expand Down
53 changes: 35 additions & 18 deletions tests/torch/slurm.vega.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,11 @@ if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then
export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU
fi

# Env variables check
# Required: DIST_MODE, COMMAND, CONTAINER_PATH (exit with status 1 when empty or unset)
# Optional: RUN_NAME (a warning is printed and the value of DIST_MODE is used instead)
if [ -z "$DIST_MODE" ]; then
  >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'"
  exit 1
fi
if [ -z "$RUN_NAME" ]; then
  >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment."
  RUN_NAME=$DIST_MODE
fi
if [ -z "$COMMAND" ]; then
  >&2 echo "ERROR: env variable COMMAND is not set. It's the python command to execute."
  exit 1
fi
if [ -z "$CONTAINER_PATH" ]; then
  # This condition is fatal, so it is reported as an error like the other required variables
  >&2 echo "ERROR: env variable CONTAINER_PATH is not set. It's the path to a singularity container."
  exit 1
fi

# Launch distributed job in container with torchrun
torchrun_launcher ()
{
# Avoid propagating PYTHONPATH to the singularity container, as it breaks the import of packages inside the container
# https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
unset PYTHONPATH

# --no-python is needed when running commands which are not python scripts (e.g., pytest, itwinai)
Expand Down Expand Up @@ -134,6 +118,7 @@ mpirun_launcher ()
# # If you want to explicitly mount host OpenMPI in container use --bind "${OMPI_HOST}":"${OMPI_CONTAINER}"

# Avoid propagating PYTHONPATH to the singularity container, as it breaks the import of packages inside the container
# https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
unset PYTHONPATH

# https://doc.vega.izum.si/mpi/#multi-node-jobs
Expand All @@ -149,6 +134,7 @@ mpirun_launcher ()
srun_launcher ()
{
# Avoid propagating PYTHONPATH to the singularity container, as it breaks the import of packages inside the container
# https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
unset PYTHONPATH

srun --mpi=pmix_v3 --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU --ntasks=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES)) \
Expand All @@ -165,6 +151,37 @@ decho ()
>&2 echo "$@"
}


###################### Initial checks ######################

# Env variables check
# Required: DIST_MODE, COMMAND, CONTAINER_PATH (exit with status 1 when empty or unset)
# Optional: RUN_NAME (a warning is printed and the value of DIST_MODE is used instead)
if [ -z "$DIST_MODE" ]; then
  >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'"
  exit 1
fi
if [ -z "$RUN_NAME" ]; then
  >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment."
  RUN_NAME=$DIST_MODE
fi
if [ -z "$COMMAND" ]; then
  >&2 echo "ERROR: env variable COMMAND is not set. It's the python command to execute."
  exit 1
fi
if [ -z "$CONTAINER_PATH" ]; then
  # This condition is fatal, so it is reported as an error like the other required variables
  >&2 echo "ERROR: env variable CONTAINER_PATH is not set. It's the path to a singularity container."
  exit 1
fi

# OpenMPI version check
# Extract the "major.minor" part of the version reported by `ompi_info --parsable`,
# once on the host and once inside the container, and abort when they differ.
HOST_OMPI_V="$(ompi_info --parsable | grep ompi:version:full: | cut -d':' -f4 | cut -d'.' -f1,2)"
# CONTAINER_PATH is quoted so that a path containing spaces is passed as a single argument
CONTAINER_OMPI_V="$(singularity exec "$CONTAINER_PATH" ompi_info --parsable | grep ompi:version:full: | cut -d':' -f4 | cut -d'.' -f1,2)"

# An empty value means ompi_info was missing or failed. Without this guard two
# empty strings would compare equal and be reported as matching versions.
if [ -z "$HOST_OMPI_V" ] || [ -z "$CONTAINER_OMPI_V" ]; then
  >&2 echo "ERROR: Could not detect OpenMPI version (host: '$HOST_OMPI_V', container: '$CONTAINER_OMPI_V'). Is ompi_info available on the host and in the container?"
  exit 1
fi

if [ "$HOST_OMPI_V" != "$CONTAINER_OMPI_V" ]; then
  >&2 echo "ERROR: Host OpenMPI minor version ($HOST_OMPI_V) does not match with container's OpenMPI minor version ($CONTAINER_OMPI_V). This may cause problems."
  exit 1
fi
# printf instead of `echo -e`: escape handling of echo differs between shells
printf "\nHost and container's OpenMPI minor versions match: (%s) - (%s)\n\n" "$HOST_OMPI_V" "$CONTAINER_OMPI_V"

# Report the hostname and the nvidia-smi output of every node (one task per node)
srun \
  --cpu-bind=none \
  --ntasks-per-node=1 \
  bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"'

Expand Down

0 comments on commit f7dc775

Please sign in to comment.