-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from FelixNahrstedt/EXPERIMENT-A100
Experiment a100
- Loading branch information
Showing
87 changed files
with
2,569 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator/superemulator_climax_tas+pr_run-02.yaml on one A100 GPU.
#
# Usage: sbatch <this script> [EXP_NAME]
#   EXP_NAME  optional label used only in the begin/end log messages;
#             falls back to the Slurm job name when omitted.
#
# Expects the working directory to be the repository root: PYTHONPATH, the
# virtualenv (env_old_emulator), requirements.txt and emulator/run.py are all
# resolved relative to it.
#
# Exit status: non-zero when environment setup or the training run fails, so
# that Slurm does not record a failed experiment as COMPLETED.
#
# NOTE: every #SBATCH directive has to stay ahead of the first executable
# command; sbatch stops reading directives at that point.

#SBATCH --job-name="SC6-A100-basetest-climax"
#SBATCH --nodes=1
#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ere"        # specify gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4                                     # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --mem-per-cpu=16G                                     # memory per cpu-core, not per node (4G per cpu-core is default); 4 x 16G = 64G total
#SBATCH --time=48:00:00                                       # set runtime
#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax.out      # set log dir to home

EXP_NAME=${1:-${SLURM_JOB_NAME:-}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
# Assign and export separately so a failing `pwd` is not masked by `export`.
PYTHONPATH=$(pwd)
export PYTHONPATH

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): A100 (Ampere) GPUs generally require CUDA >= 11, and this
  # module is CUDA 10.0. It may be harmless if the pip-installed torch wheel
  # bundles its own CUDA runtime -- confirm before relying on it.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# `deactivate` only exists while a virtualenv is active; calling it blindly
# prints "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -e env_old_emulator/bin/activate ]]; then
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  echo "activated"
else
  # First run: build the environment. Every step is fatal on failure so that
  # packages are never installed into the wrong interpreter.
  python -m venv env_old_emulator || { echo "Failed to create env_old_emulator."; exit 1; }
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  bash download_climax_checkpoints.sh || { echo "Failed to run download_climax_checkpoints.sh"; exit 1; }
  pip install -r requirements.txt || { echo "Failed to install requirements."; exit 1; }
  pip install pytorch-lightning==1.8.3 || { echo "Failed to install pytorch-lightning old version"; exit 1; }
fi

# Log the import root and the directory contents for later debugging.
echo "$PYTHONPATH"
dir

# Pytorch Lightning uses the NCCL backend for inter-GPU communication by
# default. Set this variable to avoid timeout errors.
export NCCL_BLOCKING_WAIT=1

# 8. Run Python
export HYDRA_FULL_ERROR=1
echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator/superemulator_climax_tas+pr_run-02.yaml seed=3423 || {
  status=$?
  echo "Experiment $EXP_NAME failed with exit code $status." >&2
  exit "$status"
}

# 9. Copy output to scratch
# NOTE(review): the command below is incomplete (no destination) and its file
# name differs from the -o log path above; fix both before enabling it.
#cp /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax_out.out

# 10. Experiment is finished

echo "Experiment $EXP_NAME is concluded."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator_36/superemulator_climax_tas+pr_run-02.yaml on one A100 GPU.
#
# Usage: sbatch <this script> [EXP_NAME]
#   EXP_NAME  optional label used only in the begin/end log messages;
#             falls back to the Slurm job name when omitted.
#
# Expects the working directory to be the repository root: PYTHONPATH, the
# virtualenv (env_old_emulator), requirements.txt and emulator/run.py are all
# resolved relative to it.
#
# Exit status: non-zero when environment setup or the training run fails, so
# that Slurm does not record a failed experiment as COMPLETED.
#
# NOTE: every #SBATCH directive has to stay ahead of the first executable
# command; sbatch stops reading directives at that point.

#SBATCH --job-name="SC36-A100-basetest-climax"
#SBATCH --nodes=1
#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ere"        # specify gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4                                     # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --mem-per-cpu=40G                                     # memory per cpu-core, not per node (4G per cpu-core is default); 4 x 40G = 160G total
#SBATCH --time=48:00:00                                       # set runtime
#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC36-A100-basetest-climax.out      # set log dir to home

EXP_NAME=${1:-${SLURM_JOB_NAME:-}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
# Assign and export separately so a failing `pwd` is not masked by `export`.
PYTHONPATH=$(pwd)
export PYTHONPATH

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): A100 (Ampere) GPUs generally require CUDA >= 11, and this
  # module is CUDA 10.0. It may be harmless if the pip-installed torch wheel
  # bundles its own CUDA runtime -- confirm before relying on it.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# `deactivate` only exists while a virtualenv is active; calling it blindly
# prints "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -e env_old_emulator/bin/activate ]]; then
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  echo "activated"
else
  # First run: build the environment. Every step is fatal on failure so that
  # packages are never installed into the wrong interpreter.
  python -m venv env_old_emulator || { echo "Failed to create env_old_emulator."; exit 1; }
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  #bash download_climax_checkpoints.sh || { echo "Failed to run download_climax_checkpoints.sh"; exit 1; }
  pip install -r requirements.txt || { echo "Failed to install requirements."; exit 1; }
  pip install pytorch-lightning==1.8.3 || { echo "Failed to install pytorch-lightning old version"; exit 1; }
fi

# Log the import root and the directory contents for later debugging.
echo "$PYTHONPATH"
dir

# 6. Set Flags

# Pytorch Lightning uses the NCCL backend for inter-GPU communication by
# default. Set this variable to avoid timeout errors.
export NCCL_BLOCKING_WAIT=1

# 8. Run Python

# Ask Hydra for complete stack traces, matching the SC6 variants of this job.
export HYDRA_FULL_ERROR=1
echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator_36/superemulator_climax_tas+pr_run-02.yaml seed=3423 || {
  status=$?
  echo "Experiment $EXP_NAME failed with exit code $status." >&2
  exit "$status"
}

# 9. Copy output to scratch

# 10. Experiment is finished

echo "Experiment $EXP_NAME is concluded."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator/superemulator_climax_frozen_tas+pr_run-02.yaml on one A100 GPU.
#
# Usage: sbatch <this script> [EXP_NAME]
#   EXP_NAME  optional label used only in the begin/end log messages;
#             falls back to the Slurm job name when omitted.
#
# Expects the working directory to be the repository root: PYTHONPATH, the
# virtualenv (env_old_emulator), requirements.txt and emulator/run.py are all
# resolved relative to it.
#
# Exit status: non-zero when environment setup or the training run fails, so
# that Slurm does not record a failed experiment as COMPLETED.
#
# NOTE: every #SBATCH directive has to stay ahead of the first executable
# command; sbatch stops reading directives at that point.

#SBATCH --job-name="SC6-A100-basetest-climax_frozen"
#SBATCH --nodes=1
#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ere"        # specify gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4                                     # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --mem-per-cpu=16G                                     # memory per cpu-core, not per node (4G per cpu-core is default); 4 x 16G = 64G total
#SBATCH --time=48:00:00                                       # set runtime
#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax_frozen.out      # set log dir to home

EXP_NAME=${1:-${SLURM_JOB_NAME:-}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
# Assign and export separately so a failing `pwd` is not masked by `export`.
PYTHONPATH=$(pwd)
export PYTHONPATH

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): A100 (Ampere) GPUs generally require CUDA >= 11, and this
  # module is CUDA 10.0. It may be harmless if the pip-installed torch wheel
  # bundles its own CUDA runtime -- confirm before relying on it.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# `deactivate` only exists while a virtualenv is active; calling it blindly
# prints "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -e env_old_emulator/bin/activate ]]; then
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  echo "activated"
else
  # First run: build the environment. Every step is fatal on failure so that
  # packages are never installed into the wrong interpreter.
  python -m venv env_old_emulator || { echo "Failed to create env_old_emulator."; exit 1; }
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  #bash download_climax_frozen_checkpoints.sh || { echo "Failed to run download_climax_frozen_checkpoints.sh"; exit 1; }
  pip install -r requirements.txt || { echo "Failed to install requirements."; exit 1; }
  pip install pytorch-lightning==1.8.3 || { echo "Failed to install pytorch-lightning old version"; exit 1; }
fi

# Log the import root and the directory contents for later debugging.
echo "$PYTHONPATH"
dir

# Pytorch Lightning uses the NCCL backend for inter-GPU communication by
# default. Set this variable to avoid timeout errors.
export NCCL_BLOCKING_WAIT=1

# 8. Run Python
export HYDRA_FULL_ERROR=1
echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator/superemulator_climax_frozen_tas+pr_run-02.yaml seed=3423 || {
  status=$?
  echo "Experiment $EXP_NAME failed with exit code $status." >&2
  exit "$status"
}

# 9. Copy output to scratch
# NOTE(review): the command below is incomplete (no destination) and its file
# name differs from the -o log path above; fix both before enabling it.
#cp /home/mila/f/felix-andreas.nahrstedt/Slurm/SC6-A100-basetest-climax_frozen_out.out

# 10. Experiment is finished

echo "Experiment $EXP_NAME is concluded."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/bin/bash
#
# Slurm batch job: runs emulator/run.py with the experiment config
# superemulator_36/superemulator_climax_frozen_tas+pr_run-02.yaml on one A100
# GPU.
#
# Usage: sbatch <this script> [EXP_NAME]
#   EXP_NAME  optional label used only in the begin/end log messages;
#             falls back to the Slurm job name when omitted.
#
# Expects the working directory to be the repository root: PYTHONPATH, the
# virtualenv (env_old_emulator), requirements.txt and emulator/run.py are all
# resolved relative to it.
#
# Exit status: non-zero when environment setup or the training run fails, so
# that Slurm does not record a failed experiment as COMPLETED.
#
# NOTE: every #SBATCH directive has to stay ahead of the first executable
# command; sbatch stops reading directives at that point.

#SBATCH --job-name="SC36-A100-basetest-climax_frozen"
#SBATCH --nodes=1
#SBATCH --gpus-per-node=a100:1 --constraint="dgx&ere"        # specify gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=4                                     # cpu-cores per task (>1 if multi-threaded tasks)
#SBATCH --mem-per-cpu=40G                                     # memory per cpu-core, not per node (4G per cpu-core is default); 4 x 40G = 160G total
#SBATCH --time=48:00:00                                       # set runtime
#SBATCH -o /home/mila/f/felix-andreas.nahrstedt/Slurm/SC36-A100-basetest-climax_frozen.out      # set log dir to home

EXP_NAME=${1:-${SLURM_JOB_NAME:-}}
DL_FRAMEWORK="torch"

echo "Beginning experiment $EXP_NAME."

# 1. Load Python

module load python/3.10
# Assign and export separately so a failing `pwd` is not masked by `export`.
PYTHONPATH=$(pwd)
export PYTHONPATH

# 2. Load DL Framework

if [[ "$DL_FRAMEWORK" == "torch" ]]; then
  # NOTE(review): A100 (Ampere) GPUs generally require CUDA >= 11, and this
  # module is CUDA 10.0. It may be harmless if the pip-installed torch wheel
  # bundles its own CUDA runtime -- confirm before relying on it.
  module load cuda/10.0/cudnn/7.6
  #module load python/3.7/cuda/11.1/cudnn/8.0/pytorch/1.8.1
fi

# 3. Create or Set Up Environment

# `deactivate` only exists while a virtualenv is active; calling it blindly
# prints "command not found" in a fresh batch shell.
if command -v deactivate >/dev/null 2>&1; then
  deactivate
fi

if [[ -e env_old_emulator/bin/activate ]]; then
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  echo "activated"
else
  # First run: build the environment. Every step is fatal on failure so that
  # packages are never installed into the wrong interpreter.
  python -m venv env_old_emulator || { echo "Failed to create env_old_emulator."; exit 1; }
  source env_old_emulator/bin/activate || { echo "Failed to activate env_old_emulator."; exit 1; }
  #bash download_climateset.sh || { echo "Failed to run download_climateset.sh"; exit 1; }
  pip install -r requirements.txt || { echo "Failed to install requirements."; exit 1; }
  pip install pytorch-lightning==1.8.3 || { echo "Failed to install pytorch-lightning old version"; exit 1; }
fi

# Log the import root and the directory contents for later debugging.
echo "$PYTHONPATH"
dir

# 6. Set Flags

# Pytorch Lightning uses the NCCL backend for inter-GPU communication by
# default. Set this variable to avoid timeout errors.
export NCCL_BLOCKING_WAIT=1

# 8. Run Python

# Ask Hydra for complete stack traces, matching the SC6 variants of this job.
export HYDRA_FULL_ERROR=1
echo "Running python emulator/run.py ..."
srun python emulator/run.py experiment=superemulator_36/superemulator_climax_frozen_tas+pr_run-02.yaml seed=3423 || {
  status=$?
  echo "Experiment $EXP_NAME failed with exit code $status." >&2
  exit "$status"
}

# 9. Copy output to scratch

# 10. Experiment is finished

echo "Experiment $EXP_NAME is concluded."
Oops, something went wrong.