Skip to content

Commit

Permalink
Add logging to file for all workers
Browse files Browse the repository at this point in the history
  • Loading branch information
matbun committed Nov 3, 2024
1 parent 540e1c1 commit 90fb53b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
6 changes: 4 additions & 2 deletions tests/torch/runall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@ PYTHON_VENV="../../.venv-pytorch"
# Clear SLURM logs (*.out and *.err files)
rm -rf logs_slurm
mkdir logs_slurm
rm -rf logs_torchrun
rm -rf logs_torchrun logs_mpirun logs_srun

export MNIST_PATH="/ceph/hpc/data/st2301-itwin-users/mbunino/mnist" #"/p/project1/intertwin/smalldata/mnist"

# Containers
# - itwinai_torch.sif: cmcc jlab container (OMPI v5)
# - itwinai_torch2.sif: itwinai 0.2.2.dev torch2.4 (OMPI v4.1)
# - itwinai_torch3.sif: itwinai 0.2.2.dev2 torch2.4 - force distributed (OMPI v4.1)
export CONTAINER_PATH="itwinai_torch3.sif"
# - itwinai_torch4.sif: cmcc jlab container (OMPI v4.1)
# - /ceph/hpc/data/st2301-itwin-users/mbunino/jlab_simple_reconstructed_nv_itwinai.sif: jlab container reconstructed from simple (OMPI v4.1)
export CONTAINER_PATH="/ceph/hpc/data/st2301-itwin-users/mbunino/jlab_simple_reconstructed_nv_itwinai.sif"

# Disable pytest ANSI coloring
export NO_COLOR=1
Expand Down
28 changes: 21 additions & 7 deletions tests/torch/slurm.vega.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,17 @@ mpirun_launcher ()
# https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
unset PYTHONPATH

# Create mpirun logs folder
mkdir -p "logs_mpirun/$SLURM_JOB_ID"

# https://doc.vega.izum.si/mpi/#multi-node-jobs
# "if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > /dev/null 2>&1; fi; exec" redirects stdout and stderr of ranks != 0
mpirun -H "${HOSTFILE}" -np $TOTAL_PROCESSES --oversubscribe -mca pml ucx -mca btl ^uct,tcp,openib,vader --bind-to core \
# "if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > "logs_mpirun/$SLURM_JOB_ID/rank.$OMPI_COMM_WORLD_RANK" 2>&1; fi; exec" redirects stdout and stderr of ranks != 0
# Logs of the main worker (rank == 0) will be incorporated into the standard SLURM out and err files
mpirun -H "${HOSTFILE}" -np $TOTAL_PROCESSES --oversubscribe -mca pml ucx -mca btl ^uct,tcp,openib,vader --bind-to core \
singularity exec --nv \
"${CONTAINER_PATH}" /bin/bash -c \
'echo "Rank: $OMPI_COMM_WORLD_RANK, lrank: $OMPI_COMM_WORLD_LOCAL_RANK, Size: $OMPI_COMM_WORLD_SIZE" && \
if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > /dev/null 2>&1; fi; exec '"${1}"
'echo "Rank: $OMPI_COMM_WORLD_RANK, lrank: $OMPI_COMM_WORLD_LOCAL_RANK, Size: $OMPI_COMM_WORLD_SIZE, LD_LIBRARY_PATH=$LD_LIBRARY_PATH" && \
if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > "logs_mpirun/$SLURM_JOB_ID/rank.$OMPI_COMM_WORLD_RANK" 2>&1; fi; exec '"${1}"
}

# Launch distributed job in container with srun
Expand All @@ -136,12 +140,22 @@ srun_launcher ()
# Avoid propagating PYTHONPATH to the singularity container, as it breaks the import of packages inside the container
# https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
unset PYTHONPATH

# Create srun logs folder
mkdir -p "logs_srun/$SLURM_JOB_ID"

# # Get OpenMPI installation prefixes (locally and in container)
# OMPI_CONTAINER="$(singularity exec ${CONTAINER_PATH} /bin/bash -c 'ompi_info' | grep Prefix | awk '{ print $2 }')"
# OMPI_HOST="$(ompi_info | grep Prefix | awk '{ print $2 }')"
# # If you want to explicitly mount host OpenMPI in container use --bind "${OMPI_HOST}":"${OMPI_CONTAINER}"

# "if [ $SLURM_PROCID -ne 0 ]; then exec > "logs_srun/$SLURM_JOB_ID/rank.$SLURM_PROCID" 2>&1; fi; exec" redirects stdout and stderr of ranks != 0
# Logs of the main worker (rank == 0) will be incorporated into the standard SLURM out and err files
srun --mpi=pmix_v3 --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU --ntasks=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES)) \
singularity exec --nv \
$CONTAINER_PATH /bin/bash -c \
'echo "Rank: $SLURM_PROCID" && \
if [ $SLURM_PROCID -ne 0 ]; then exec > /dev/null 2>&1; fi; exec '"${1}"
"${CONTAINER_PATH}" /bin/bash -c \
'echo "Rank: $SLURM_PROCID, LD_LIBRARY_PATH=$LD_LIBRARY_PATH" && \
if [ $SLURM_PROCID -ne 0 ]; then exec > "logs_srun/$SLURM_JOB_ID/rank.$SLURM_PROCID" 2>&1; fi; exec '"${1}"
}

# Dual echo on both stdout and stderr
Expand Down

0 comments on commit 90fb53b

Please sign in to comment.