From 90fb53bfee58caf71c91974f3ee227535ccd0706 Mon Sep 17 00:00:00 2001
From: "48362942+matbun@users.noreply.github.com"
Date: Sun, 3 Nov 2024 20:29:47 +0100
Subject: [PATCH] Add logging to file for all workers

---
 tests/torch/runall.sh     |  6 ++++--
 tests/torch/slurm.vega.sh | 28 +++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/tests/torch/runall.sh b/tests/torch/runall.sh
index 7a391312..72ee0863 100644
--- a/tests/torch/runall.sh
+++ b/tests/torch/runall.sh
@@ -6,7 +6,7 @@ PYTHON_VENV="../../.venv-pytorch"
 
 # Clear SLURM logs (*.out and *.err files)
 rm -rf logs_slurm
 mkdir logs_slurm
-rm -rf logs_torchrun
+rm -rf logs_torchrun logs_mpirun logs_srun
 
 export MNIST_PATH="/ceph/hpc/data/st2301-itwin-users/mbunino/mnist" #"/p/project1/intertwin/smalldata/mnist"
@@ -14,7 +14,9 @@ export MNIST_PATH="/ceph/hpc/data/st2301-itwin-users/mbunino/mnist" #"/p/projec
 # - itwinai_torch.sif: cmcc jlab container (OMPI v5)
 # - itwinai_torch2.sif: itwinai 0.2.2.dev torch2.4 (OMPI v4.1)
 # - itwinai_torch3.sif: itwinai 0.2.2.dev2 torch2.4 - force distributed (OMPI v4.1)
-export CONTAINER_PATH="itwinai_torch3.sif"
+# - itwinai_torch4.sif: cmcc jlab container (OMPI v4.1)
+# - /ceph/hpc/data/st2301-itwin-users/mbunino/jlab_simple_reconstructed_nv_itwinai.sif: jlab container reconstructed from simple (OMPI v4.1)
+export CONTAINER_PATH="/ceph/hpc/data/st2301-itwin-users/mbunino/jlab_simple_reconstructed_nv_itwinai.sif"
 
 # Disable pytest ANSI coloring
 export NO_COLOR=1
diff --git a/tests/torch/slurm.vega.sh b/tests/torch/slurm.vega.sh
index 64ec3663..b44da5b7 100644
--- a/tests/torch/slurm.vega.sh
+++ b/tests/torch/slurm.vega.sh
@@ -121,13 +121,17 @@ mpirun_launcher ()
     # https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
     unset PYTHONPATH
 
+    # Create mpirun logs folder
+    mkdir -p "logs_mpirun/$SLURM_JOB_ID"
+
     # https://doc.vega.izum.si/mpi/#multi-node-jobs
-    # "if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > /dev/null 2>&1; fi; exec" redirects stdout and stderr of ranks != 0
-    mpirun -H "${HOSTFILE}" -np $TOTAL_PROCESSES --oversubscribe -mca pml ucx -mca btl ^uct,tcp,openib,vader --bind-to core \
+    # "if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > "logs_mpirun/$SLURM_JOB_ID/rank.$OMPI_COMM_WORLD_RANK" 2>&1; fi; exec" redirects stdout and stderr of ranks != 0
+    # Logs of the main worker (rank == 0) will be incorporated into the standard SLURM out and err files
+    mpirun -H "${HOSTFILE}" -np $TOTAL_PROCESSES --oversubscribe -mca pml ucx -mca btl ^uct,tcp,openib,vader --bind-to core \
         singularity exec --nv \
         "${CONTAINER_PATH}" /bin/bash -c \
-        'echo "Rank: $OMPI_COMM_WORLD_RANK, lrank: $OMPI_COMM_WORLD_LOCAL_RANK, Size: $OMPI_COMM_WORLD_SIZE" && \
-        if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > /dev/null 2>&1; fi; exec '"${1}"
+        'echo "Rank: $OMPI_COMM_WORLD_RANK, lrank: $OMPI_COMM_WORLD_LOCAL_RANK, Size: $OMPI_COMM_WORLD_SIZE, LD_LIBRARY_PATH=$LD_LIBRARY_PATH" && \
+        if [ $OMPI_COMM_WORLD_RANK -ne 0 ]; then exec > "logs_mpirun/$SLURM_JOB_ID/rank.$OMPI_COMM_WORLD_RANK" 2>&1; fi; exec '"${1}"
 }
 
 # Launch distributed job in container with srun
@@ -136,12 +140,22 @@ srun_launcher ()
     # Avoid propagating PYTHONPATH to the singularity container, as it breaks the import of packages inside the container
     # https://docs.sylabs.io/guides/4.1/user-guide/environment_and_metadata.html#environment-from-the-host
     unset PYTHONPATH
+
+    # Create srun logs folder
+    mkdir -p "logs_srun/$SLURM_JOB_ID"
+
+    # # Get OpenMPI installation prefixes (locally and in container)
+    # OMPI_CONTAINER="$(singularity exec ${CONTAINER_PATH} /bin/bash -c 'ompi_info' | grep Prefix | awk '{ print $2 }')"
+    # OMPI_HOST="$(ompi_info | grep Prefix | awk '{ print $2 }')"
+    # # If you want to explicitly mount host OpenMPI in container use --bind "${OMPI_HOST}":"${OMPI_CONTAINER}"
 
+    # "if [ $SLURM_PROCID -ne 0 ]; then exec > "logs_srun/$SLURM_JOB_ID/rank.$SLURM_PROCID" 2>&1; fi; exec" redirects stdout and stderr of ranks != 0
+    # Logs of the main worker (rank == 0) will be incorporated into the standard SLURM out and err files
     srun --mpi=pmix_v3 --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU --ntasks=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES)) \
         singularity exec --nv \
-        $CONTAINER_PATH /bin/bash -c \
-        'echo "Rank: $SLURM_PROCID" && \
-        if [ $SLURM_PROCID -ne 0 ]; then exec > /dev/null 2>&1; fi; exec '"${1}"
+        "${CONTAINER_PATH}" /bin/bash -c \
+        'echo "Rank: $SLURM_PROCID, LD_LIBRARY_PATH=$LD_LIBRARY_PATH" && \
+        if [ $SLURM_PROCID -ne 0 ]; then exec > "logs_srun/$SLURM_JOB_ID/rank.$SLURM_PROCID" 2>&1; fi; exec '"${1}"
 }
 
 # Dual echo on both stdout and stderr
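
Note on the redirection idiom used by both launchers: each rank starts a /bin/bash -c wrapper; rank 0 keeps stdout/stderr attached to the SLURM .out/.err files, while every other rank rebinds its file descriptors to logs_<launcher>/$SLURM_JOB_ID/rank.<rank> before exec'ing the workload. Below is a minimal standalone sketch of the same idiom, assuming an OpenMPI launch; the wrapper name rank_logger.sh and the fallback defaults are illustrative only and not part of this patch.

    #!/bin/bash
    # rank_logger.sh -- hypothetical wrapper illustrating the per-rank
    # redirection used by mpirun_launcher/srun_launcher above. Assumes an
    # OpenMPI launch; under srun the rank would come from SLURM_PROCID.
    RANK="${OMPI_COMM_WORLD_RANK:-0}"
    LOG_DIR="logs_mpirun/${SLURM_JOB_ID:-interactive}"
    mkdir -p "$LOG_DIR"

    if [ "$RANK" -ne 0 ]; then
        # Ranks != 0: rebind this shell's stdout and stderr to a per-rank
        # file; the rebinding is inherited by the exec'd workload below.
        exec > "$LOG_DIR/rank.$RANK" 2>&1
    fi

    # Rank 0 falls through with stdout/stderr still attached to the SLURM
    # .out/.err files. exec replaces the shell with the workload command.
    exec "$@"

Example invocation: mpirun -np 4 ./rank_logger.sh python train.py. Using exec rather than running the workload as a child process means the MPI launcher signals the workload directly instead of an intermediate shell.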