Skip to content

Commit

Permalink
Merge pull request #6 from FelixNahrstedt/EXPERIMENT-A100
Browse files Browse the repository at this point in the history
Experiment a100
  • Loading branch information
FelixNahrstedt authored Apr 16, 2024
2 parents bf1511a + ae6e7a3 commit 223bbcc
Show file tree
Hide file tree
Showing 87 changed files with 2,569 additions and 40 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ MANIFEST
emulator/logs
emulator/wandb
wandb/
Slurm_Scripts/

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -154,3 +153,7 @@ dmypy.json

# Pyre type checker
.pyre/

# Checkpoints
emulator/*/checkpoints/
emulator/src/core/models/climax/pretrained_checkpoints/
76 changes: 76 additions & 0 deletions Slurm_A100/climax
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator/superemulator_climax_tas+pr_run-02.yaml on one A100 GPU.
#
# Usage: sbatch <this-script> [EXP_NAME]
#   EXP_NAME  optional label used only in log messages; falls back to the
#             Slurm job name when omitted.
#
# Every path used below (env_old_emulator/, requirements.txt,
# download_climax_checkpoints.sh, emulator/run.py) is relative, so the job
# has to start in the directory that contains them.
#
# Keep all SBATCH directives above the first executable command; Slurm stops
# reading directives once it reaches real shell code.

#SBATCH --job-name="SC6-A100-basetest-climax"

#SBATCH --nodes=1

#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ampere" # specify gpu

#SBATCH --ntasks-per-node=1

#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks)

#SBATCH --mem-per-cpu=16G # memory per cpu-core (4 cores x 16G = 64G for the task)

#SBATCH --time=48:00:00 # set runtime

#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax.out # set log dir to home


# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

readonly VENV_DIR="env_old_emulator"
EXP_NAME=${1:-${SLURM_JOB_NAME:-unnamed}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
export PYTHONPATH="$PWD"

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): this job requests an A100, and Ampere GPUs need CUDA >= 11.
  # Presumably the pip-installed torch wheel ships its own CUDA runtime so
  # this module is not what the training actually uses -- confirm.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# Leave an inherited virtualenv only if one is active; calling `deactivate`
# unconditionally just logs "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -f "$VENV_DIR/bin/activate" ]]; then
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  echo "activated"
else
  # First run: build the environment and fetch everything the experiment needs.
  python -m venv "$VENV_DIR" || die "Failed to create virtual environment $VENV_DIR"
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  bash download_climax_checkpoints.sh || die "Failed to run download_climax_checkpoints.sh"
  pip install -r requirements.txt || die "Failed to install requirements."
  pip install pytorch-lightning==1.8.3 || die "Failed to install pytorch-lightning old version"
fi

# Log the import path and the working directory contents for debugging.
echo "$PYTHONPATH"
dir

# 4. Set Flags

export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-GPU communication by default. Set this variable to avoid timeout errors.
export HYDRA_FULL_ERROR=1 # ask Hydra for the complete stack trace on errors

# 5. Run Python

echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator/superemulator_climax_tas+pr_run-02.yaml seed=3423
run_status=$?

# 6. Copy output to scratch
# TODO: not implemented; the command kept below has no destination argument.
#cp /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax_out.out

# 7. Experiment is finished

# Propagate the training result so Slurm does not record a crashed run as
# a successful job.
if (( run_status != 0 )); then
  printf 'Experiment %s failed (srun exit status %d).\n' "$EXP_NAME" "$run_status" >&2
  exit "$run_status"
fi

echo "Experiment $EXP_NAME is concluded."
79 changes: 79 additions & 0 deletions Slurm_A100/climax_36
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator_36/superemulator_climax_tas+pr_run-02.yaml on one A100 GPU.
#
# Usage: sbatch <this-script> [EXP_NAME]
#   EXP_NAME  optional label used only in log messages; falls back to the
#             Slurm job name when omitted.
#
# Every path used below (env_old_emulator/, requirements.txt,
# emulator/run.py) is relative, so the job has to start in the directory
# that contains them.
#
# Keep all SBATCH directives above the first executable command; Slurm stops
# reading directives once it reaches real shell code.

#SBATCH --job-name="SC36-A100-basetest-climax"

#SBATCH --nodes=1

#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ampere" # specify gpu

#SBATCH --ntasks-per-node=1

#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks)

#SBATCH --mem-per-cpu=40G # memory per cpu-core (4 cores x 40G = 160G for the task)

#SBATCH --time=48:00:00 # set runtime

#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC36-A100-basetest-climax.out # set log dir to home


# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

readonly VENV_DIR="env_old_emulator"
EXP_NAME=${1:-${SLURM_JOB_NAME:-unnamed}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
export PYTHONPATH="$PWD"

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): this job requests an A100, and Ampere GPUs need CUDA >= 11.
  # Presumably the pip-installed torch wheel ships its own CUDA runtime so
  # this module is not what the training actually uses -- confirm.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# Leave an inherited virtualenv only if one is active; calling `deactivate`
# unconditionally just logs "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -f "$VENV_DIR/bin/activate" ]]; then
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  echo "activated"
else
  # First run: build the environment and install the Python dependencies.
  python -m venv "$VENV_DIR" || die "Failed to create virtual environment $VENV_DIR"
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  #bash download_climax_checkpoints.sh || { echo "Failed to run download_climax_checkpoints.sh"; exit 1; }
  pip install -r requirements.txt || die "Failed to install requirements."
  pip install pytorch-lightning==1.8.3 || die "Failed to install pytorch-lightning old version"
fi

# Log the import path and the working directory contents for debugging.
echo "$PYTHONPATH"
dir

# 4. Set Flags

export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-GPU communication by default. Set this variable to avoid timeout errors.
export HYDRA_FULL_ERROR=1 # ask Hydra for the complete stack trace on errors

# 5. Run Python

echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator_36/superemulator_climax_tas+pr_run-02.yaml seed=3423
run_status=$?

# 6. Copy output to scratch
# TODO: not implemented.

# 7. Experiment is finished

# Propagate the training result so Slurm does not record a crashed run as
# a successful job.
if (( run_status != 0 )); then
  printf 'Experiment %s failed (srun exit status %d).\n' "$EXP_NAME" "$run_status" >&2
  exit "$run_status"
fi

echo "Experiment $EXP_NAME is concluded."
76 changes: 76 additions & 0 deletions Slurm_A100/climax_frozen
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator/superemulator_climax_frozen_tas+pr_run-02.yaml on one A100 GPU.
#
# Usage: sbatch <this-script> [EXP_NAME]
#   EXP_NAME  optional label used only in log messages; falls back to the
#             Slurm job name when omitted.
#
# Every path used below (env_old_emulator/, requirements.txt,
# emulator/run.py) is relative, so the job has to start in the directory
# that contains them.
#
# Keep all SBATCH directives above the first executable command; Slurm stops
# reading directives once it reaches real shell code.

#SBATCH --job-name="SC6-A100-basetest-climax_frozen"

#SBATCH --nodes=1

#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ampere" # specify gpu

#SBATCH --ntasks-per-node=1

#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks)

#SBATCH --mem-per-cpu=16G # memory per cpu-core (4 cores x 16G = 64G for the task)

#SBATCH --time=48:00:00 # set runtime

#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax_frozen.out # set log dir to home


# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

readonly VENV_DIR="env_old_emulator"
EXP_NAME=${1:-${SLURM_JOB_NAME:-unnamed}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
export PYTHONPATH="$PWD"

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): this job requests an A100, and Ampere GPUs need CUDA >= 11.
  # Presumably the pip-installed torch wheel ships its own CUDA runtime so
  # this module is not what the training actually uses -- confirm.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# Leave an inherited virtualenv only if one is active; calling `deactivate`
# unconditionally just logs "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -f "$VENV_DIR/bin/activate" ]]; then
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  echo "activated"
else
  # First run: build the environment and install the Python dependencies.
  python -m venv "$VENV_DIR" || die "Failed to create virtual environment $VENV_DIR"
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  #bash download_climax_frozen_checkpoints.sh || { echo "Failed to run download_climax_frozen_checkpoints.sh"; exit 1; }
  pip install -r requirements.txt || die "Failed to install requirements."
  pip install pytorch-lightning==1.8.3 || die "Failed to install pytorch-lightning old version"
fi

# Log the import path and the working directory contents for debugging.
echo "$PYTHONPATH"
dir

# 4. Set Flags

export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-GPU communication by default. Set this variable to avoid timeout errors.
export HYDRA_FULL_ERROR=1 # ask Hydra for the complete stack trace on errors

# 5. Run Python

echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator/superemulator_climax_frozen_tas+pr_run-02.yaml seed=3423
run_status=$?

# 6. Copy output to scratch
# TODO: not implemented; the command kept below has no destination argument.
#cp /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax_frozen_out.out

# 7. Experiment is finished

# Propagate the training result so Slurm does not record a crashed run as
# a successful job.
if (( run_status != 0 )); then
  printf 'Experiment %s failed (srun exit status %d).\n' "$EXP_NAME" "$run_status" >&2
  exit "$run_status"
fi

echo "Experiment $EXP_NAME is concluded."
78 changes: 78 additions & 0 deletions Slurm_A100/climax_frozen_36
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator_36/superemulator_climax_frozen_tas+pr_run-02.yaml on one
# A100 GPU.
#
# Usage: sbatch <this-script> [EXP_NAME]
#   EXP_NAME  optional label used only in log messages; falls back to the
#             Slurm job name when omitted.
#
# Every path used below (env_old_emulator/, requirements.txt,
# emulator/run.py) is relative, so the job has to start in the directory
# that contains them.
#
# Keep all SBATCH directives above the first executable command; Slurm stops
# reading directives once it reaches real shell code.

#SBATCH --job-name="SC36-A100-basetest-climax_frozen"

#SBATCH --nodes=1

#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ampere" # specify gpu

#SBATCH --ntasks-per-node=1

#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks)

#SBATCH --mem-per-cpu=40G # memory per cpu-core (4 cores x 40G = 160G for the task)

#SBATCH --time=48:00:00 # set runtime

#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC36-A100-basetest-climax_frozen.out # set log dir to home


# Print a message to stderr and abort the job with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

readonly VENV_DIR="env_old_emulator"
EXP_NAME=${1:-${SLURM_JOB_NAME:-unnamed}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
export PYTHONPATH="$PWD"

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): this job requests an A100, and Ampere GPUs need CUDA >= 11.
  # Presumably the pip-installed torch wheel ships its own CUDA runtime so
  # this module is not what the training actually uses -- confirm.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# Leave an inherited virtualenv only if one is active; calling `deactivate`
# unconditionally just logs "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -f "$VENV_DIR/bin/activate" ]]; then
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  echo "activated"
else
  # First run: build the environment and install the Python dependencies.
  python -m venv "$VENV_DIR" || die "Failed to create virtual environment $VENV_DIR"
  # shellcheck source=/dev/null
  source "$VENV_DIR/bin/activate" || die "Failed to activate $VENV_DIR"
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  pip install -r requirements.txt || die "Failed to install requirements."
  pip install pytorch-lightning==1.8.3 || die "Failed to install pytorch-lightning old version"
fi

# Log the import path and the working directory contents for debugging.
echo "$PYTHONPATH"
dir

# 4. Set Flags

export NCCL_BLOCKING_WAIT=1 #Pytorch Lightning uses the NCCL backend for inter-GPU communication by default. Set this variable to avoid timeout errors.
export HYDRA_FULL_ERROR=1 # ask Hydra for the complete stack trace on errors

# 5. Run Python

echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator_36/superemulator_climax_frozen_tas+pr_run-02.yaml seed=3423
run_status=$?

# 6. Copy output to scratch
# TODO: not implemented.

# 7. Experiment is finished

# Propagate the training result so Slurm does not record a crashed run as
# a successful job.
if (( run_status != 0 )); then
  printf 'Experiment %s failed (srun exit status %d).\n' "$EXP_NAME" "$run_status" >&2
  exit "$run_status"
fi

echo "Experiment $EXP_NAME is concluded."
Loading

0 comments on commit 223bbcc

Please sign in to comment.