From e6b7cd0d44512a87e2c01738ab4bb4ed0a5ad8c5 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 15:53:28 -0800 Subject: [PATCH 01/44] Attempt 1 --- Code/main.py | 14 +++++++++++--- Code/training.py | 18 ++++++++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/Code/main.py b/Code/main.py index 79e7ea7..88ea65e 100644 --- a/Code/main.py +++ b/Code/main.py @@ -15,16 +15,23 @@ import os import traceback import torch +import torch.multiprocessing as mp +import torch.distributed as dist # Define Working Directories grayscale_dir = '../Dataset/Greyscale' rgb_dir = '../Dataset/RGB' # Define Universal Parameters -image_height = 400 -image_width = 600 +image_height = 4000 +image_width = 6000 batch_size = 2 +def main_worker(rank, world_size): + # Initialize the distributed environment. + dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) + main() # Call the existing main function. + dist.destroy_process_group() # Cleanup after finishing. def main(): # Initialize Dataset Object (PyTorch Tensors) @@ -150,4 +157,5 @@ def main(): if __name__ == '__main__': - main() + world_size = torch.cuda.device_count() # Number of available GPUs + mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True) diff --git a/Code/training.py b/Code/training.py index 0639c45..960288d 100644 --- a/Code/training.py +++ b/Code/training.py @@ -11,21 +11,27 @@ # Import Necessary Libraries import torch import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist # Define Training Class class Trainer(): - def __init__(self, model, loss_function, optimizer=None, model_save_path=None): - # Use All Available CUDA GPUs for Training (if Available) - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - if torch.cuda.device_count() > 1: - model = nn.DataParallel(model) + def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None): + self.rank = rank # Rank of the current process + self.device = torch.device(f'cuda:{rank}' if torch.cuda.is_available() else 'cpu') self.model = model.to(self.device) # Define the loss function self.loss_function = loss_function # Define the optimizer self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) + # Wrap model with DDP + if torch.cuda.device_count() > 1 and rank is not None: + self.model = DDP(self.model, device_ids=[rank]) # Define the path to save the model - self.model_save_path = model_save_path + self.model_save_path = model_save_path if rank == 0 else None # Only save on master process + + def cleanup_ddp(self): + dist.destroy_process_group() def save_model(self): # Save the model From 3120b72960e664e18038853a4567c285932f36d6 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 15:58:18 -0800 Subject: [PATCH 02/44] Added Debugging Statements --- Code/training.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Code/training.py b/Code/training.py index 960288d..781245a 100644 --- a/Code/training.py +++ b/Code/training.py @@ -29,6 +29,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.model = DDP(self.model, device_ids=[rank]) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process + print(f'Process {self.rank} is using {self.device}') def cleanup_ddp(self): dist.destroy_process_group() From 
c61efb4e4991854c29331307f3c37fd2eef7e0b5 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 15:59:58 -0800 Subject: [PATCH 03/44] Added Shell Script to Run Code with Environment Variables --- run_code.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 run_code.sh diff --git a/run_code.sh b/run_code.sh new file mode 100644 index 0000000..7dcb873 --- /dev/null +++ b/run_code.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Set environment variables +export MASTER_ADDR=localhost +export MASTER_PORT=12345 + +# Run the Python script +python Code/main.py \ No newline at end of file From a6911cd284522d892c636256c1aae4f6924110aa Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:02:25 -0800 Subject: [PATCH 04/44] Changes Shell Script to Set Env Variables --- Code/set_env_var.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Code/set_env_var.sh diff --git a/Code/set_env_var.sh b/Code/set_env_var.sh new file mode 100644 index 0000000..b7dcf76 --- /dev/null +++ b/Code/set_env_var.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Set environment variables +export MASTER_ADDR=localhost +export MASTER_PORT=12345 \ No newline at end of file From e668d494b0419fcfc5488c9d9a45daa05ce2700e Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:09:50 -0800 Subject: [PATCH 05/44] Added 'rank' as parameter in Main --- Code/main.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Code/main.py b/Code/main.py index 88ea65e..9211402 100644 --- a/Code/main.py +++ b/Code/main.py @@ -30,10 +30,10 @@ def main_worker(rank, world_size): # Initialize the distributed environment. dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) - main() # Call the existing main function. - dist.destroy_process_group() # Cleanup after finishing. + print(f"Initialized process group for rank {rank}, world size {world_size}") + main(rank) # Call the existing main function. 
-def main(): +def main(rank): # Initialize Dataset Object (PyTorch Tensors) try: dataset = CustomDataset(grayscale_dir, rgb_dir, (image_height, image_width), batch_size) @@ -70,17 +70,17 @@ def main(): # Method 1 : Baseline : Mean Squared Error Loss for AutoEncoder and LSTM os.makedirs('../Models/Method1', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = '../Models/Method1/model_autoencoder_m1.pth' - trainer_autoencoder_baseline = Trainer(model_autoencoder, loss_mse, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae) + trainer_autoencoder_baseline = Trainer(model_autoencoder, loss_mse, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) print('Method-1 AutoEncoder Trainer Initialized.') model_save_path_lstm = '../Models/Method1/model_lstm_m1.pth' - trainer_lstm_baseline = Trainer(model_lstm, loss_mse, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm) + trainer_lstm_baseline = Trainer(model_lstm, loss_mse, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) print('Method-1 LSTM Trainer Initialized.') print('-'*10) # Makes Output Readable # Method 2 : Composite Loss (MSE + MaxEnt) for AutoEncoder and Mean Squared Error Loss for LSTM os.makedirs('../Models/Method2', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = '../Models/Method2/model_autoencoder_m2.pth' - trainer_autoencoder_m2 = Trainer(model=model_autoencoder, loss_function=loss_mep, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae) + trainer_autoencoder_m2 = Trainer(model=model_autoencoder, loss_function=loss_mep, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) print('Method-2 AutoEncoder Trainer Initialized.') print('Method-2 LSTM == Method-1 LSTM') print('-'*10) # Makes Output Readable @@ -89,7 +89,7 @@ def main(): os.makedirs('../Models/Method3', exist_ok=True) # Creating Directory for Model Saving print('Method-3 AutoEncoder == Method-1 AutoEncoder') model_save_path_lstm = '../Models/Method3/model_lstm_m3.pth' - trainer_lstm_m3 = Trainer(model_lstm, loss_ssim, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm) + trainer_lstm_m3 = Trainer(model_lstm, loss_ssim, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) print('Method-3 LSTM Trainer Initialized.') print('-'*10) # Makes Output Readable From 6a6045c3c1a7d6718285bcc048e98f1b08953482 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:10:09 -0800 Subject: [PATCH 06/44] Updated Model Saving for Device Rank=0 --- Code/training.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Code/training.py b/Code/training.py index 781245a..1405a95 100644 --- a/Code/training.py +++ b/Code/training.py @@ -35,8 +35,9 @@ def cleanup_ddp(self): dist.destroy_process_group() def save_model(self): - # Save the model - torch.save(self.model.state_dict(), self.model_save_path) + if self.rank == 0: + # Save the model + torch.save(self.model.state_dict(), self.model_save_path) def train_autoencoder(self, epochs, train_loader, val_loader): # Print Names of All Available GPUs (if any) to Train the Model From ba830154f9ae1cfb1d079eb67a64931460b64d65 Mon Sep 17 
00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:10:28 -0800 Subject: [PATCH 07/44] Removed Unnecessary Script --- run_code.sh | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 run_code.sh diff --git a/run_code.sh b/run_code.sh deleted file mode 100644 index 7dcb873..0000000 --- a/run_code.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -# Set environment variables -export MASTER_ADDR=localhost -export MASTER_PORT=12345 - -# Run the Python script -python Code/main.py \ No newline at end of file From 606e297e9f76aa9c4e1741f0f0401b60f5ff49c7 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:15:19 -0800 Subject: [PATCH 08/44] Reduced File Size for Testing --- Code/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index 9211402..0e3cd98 100644 --- a/Code/main.py +++ b/Code/main.py @@ -23,8 +23,8 @@ rgb_dir = '../Dataset/RGB' # Define Universal Parameters -image_height = 4000 -image_width = 6000 +image_height = 400 +image_width = 600 batch_size = 2 def main_worker(rank, world_size): From adb2e18a8eff8f3f8d112cd38700b65c0bb2d899 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:18:12 -0800 Subject: [PATCH 09/44] Attempt 2 --- Code/main.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Code/main.py b/Code/main.py index 0e3cd98..91f3d0f 100644 --- a/Code/main.py +++ b/Code/main.py @@ -112,6 +112,8 @@ def main(rank): except Exception as e: print(f"Method-1 AutoEncoder Training Error : \n{e}") traceback.print_exc() + finally: + trainer_autoencoder_baseline.cleanup_ddp() print('-'*10) # Makes Output Readable try: epochs = 1 @@ -121,6 +123,8 @@ def main(rank): except Exception as e: print(f"Method-1 LSTM Training Error : \n{e}") traceback.print_exc() + finally: + trainer_lstm_baseline.cleanup_ddp() print('-'*20) # Makes Output Readable # Method-2 @@ -132,6 +136,8 @@ def main(rank): except Exception as e: print(f"Method-2 AutoEncoder Training Error : \n{e}") traceback.print_exc() + finally: + trainer_autoencoder_m2.cleanup_ddp() print('-'*10) # Makes Output Readable print("Method-2 LSTM == Method-1 LSTM, No Need To Train Again.") print('-'*20) # Makes Output Readable @@ -147,6 +153,8 @@ def main(rank): except Exception as e: print(f"Method-3 LSTM Training Error : \n{e}") traceback.print_exc() + finally: + trainer_lstm_m3.cleanup_ddp() print('-'*20) # Makes Output Readable # Method-4 From 92c7c2796788d53c2443d84edfd7a3c71e21ae84 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:25:17 -0800 Subject: [PATCH 10/44] Attempt 3 --- Code/main.py | 3 +++ Code/training.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index 91f3d0f..863fccf 100644 --- a/Code/main.py +++ b/Code/main.py @@ -29,6 +29,9 @@ def main_worker(rank, world_size): # Initialize the distributed environment. + torch.manual_seed(0) + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) print(f"Initialized process group for rank {rank}, world size {world_size}") main(rank) # Call the existing main function. 
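The patches up to this point converge on the standard single-node DDP bootstrap: spawn one process per GPU with mp.spawn, point every process at the same rendezvous address, initialize an NCCL process group, checkpoint only from rank 0, and tear the group down when training ends. The sketch below is a minimal, self-contained version of that pattern, not code from this repository: the toy Net model, the checkpoint path, and the hard-coded address/port are placeholder assumptions, and it presumes at least one CUDA GPU.

    # Minimal single-node DDP sketch (assumptions: one process per GPU, toy model,
    # MASTER_ADDR/MASTER_PORT values mirroring the scripts in the patches above).
    import os
    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp
    from torch.nn.parallel import DistributedDataParallel as DDP

    class Net(torch.nn.Module):          # placeholder model, not the project's AutoEncoder/LSTM
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(8, 8)
        def forward(self, x):
            return self.fc(x)

    def worker(rank, world_size):
        os.environ.setdefault('MASTER_ADDR', 'localhost')
        os.environ.setdefault('MASTER_PORT', '12345')
        dist.init_process_group('nccl', init_method='env://', world_size=world_size, rank=rank)
        torch.cuda.set_device(rank)                    # bind this process to its own GPU
        model = Net().to(rank)
        model = DDP(model, device_ids=[rank])          # gradients are all-reduced across ranks
        # ... training loop would go here ...
        if rank == 0:                                  # checkpoint once, from the master process
            torch.save(model.module.state_dict(), 'checkpoint.pth')  # .module strips the DDP wrapper
        dist.barrier()                                 # keep other ranks alive until the save finishes
        dist.destroy_process_group()

    if __name__ == '__main__':
        world_size = torch.cuda.device_count()
        mp.spawn(worker, args=(world_size,), nprocs=world_size, join=True)

Saving model.module.state_dict() rather than the wrapped model's state_dict keeps the checkpoint free of the "module."-prefixed keys that DDP otherwise adds, so it can later be loaded into a plain, single-process model.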
diff --git a/Code/training.py b/Code/training.py index 1405a95..90873e3 100644 --- a/Code/training.py +++ b/Code/training.py @@ -26,13 +26,14 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: - self.model = DDP(self.model, device_ids=[rank]) + self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process print(f'Process {self.rank} is using {self.device}') def cleanup_ddp(self): - dist.destroy_process_group() + if dist.is_initialized(): + dist.destroy_process_group() def save_model(self): if self.rank == 0: From 49a673f3c36c424441907b2f2a9f67cee852c054 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:50:21 -0800 Subject: [PATCH 11/44] Removed Redundant Print Statements --- Code/main.py | 136 ++++++++++++++++++++++++++++++----------------- Code/training.py | 10 ++-- 2 files changed, 92 insertions(+), 54 deletions(-) diff --git a/Code/main.py b/Code/main.py index 863fccf..a137ac2 100644 --- a/Code/main.py +++ b/Code/main.py @@ -40,32 +40,43 @@ def main(rank): # Initialize Dataset Object (PyTorch Tensors) try: dataset = CustomDataset(grayscale_dir, rgb_dir, (image_height, image_width), batch_size) - print('Importing Dataset Complete.') + if rank == 0: + print('Importing Dataset Complete.') except Exception as e: - print(f"Importing Dataset In-Complete : \n{e}") + if rank == 0: + print(f"Importing Dataset In-Complete : \n{e}") + if rank == 0: + print('-'*20) # Makes Output Readable # Import Loss Functions try: loss_mse = LossMSE() # Mean Squared Error Loss loss_mep = LossMEP(alpha=0.4) # Maximum Entropy Loss loss_ssim = SSIMLoss() # Structural Similarity Index Measure Loss - print('Importing Loss Functions Complete.') + if rank == 0: + print('Importing Loss Functions Complete.') except Exception as e: - print(f"Importing Loss Functions In-Complete : \n{e}") - print('-'*20) # Makes Output Readable + if rank == 0: + print(f"Importing Loss Functions In-Complete : \n{e}") + if rank == 0: + print('-'*20) # Makes Output Readable # Initialize AutoEncoder Model and Import Dataloader (Training, Validation) data_autoencoder_train, data_autoencoder_val = dataset.get_autoencoder_batches(val_split=0.2) - print('AutoEncoder Model Data Imported.') + if rank == 0: + print('AutoEncoder Model Data Imported.') model_autoencoder = Grey2RGBAutoEncoder() - print('AutoEncoder Model Initialized.') - print('-'*20) # Makes Output Readable + if rank == 0: + print('AutoEncoder Model Initialized.') + print('-'*20) # Makes Output Readable # Initialize LSTM Model and Import Dataloader (Training, Validation) data_lstm_train, data_lstm_val = dataset.get_lstm_batches(val_split=0.25, sequence_length=2) - print('LSTM Model Data Imported.') + if rank == 0: + print('LSTM Model Data Imported.') model_lstm = ConvLSTM(input_dim=1, hidden_dims=[1,1,1], kernel_size=(3, 3), num_layers=3, alpha=0.5) - print('LSTM Model Initialized.') - print('-'*20) # Makes Output Readable + if rank == 0: + print('LSTM Model Initialized.') + print('-'*20) # Makes Output Readable ''' Initialize Trainer Objects @@ -74,33 +85,38 @@ def main(rank): os.makedirs('../Models/Method1', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = 
'../Models/Method1/model_autoencoder_m1.pth' trainer_autoencoder_baseline = Trainer(model_autoencoder, loss_mse, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) - print('Method-1 AutoEncoder Trainer Initialized.') + if rank == 0: + print('Method-1 AutoEncoder Trainer Initialized.') model_save_path_lstm = '../Models/Method1/model_lstm_m1.pth' trainer_lstm_baseline = Trainer(model_lstm, loss_mse, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) - print('Method-1 LSTM Trainer Initialized.') - print('-'*10) # Makes Output Readable + if rank == 0: + print('Method-1 LSTM Trainer Initialized.') + print('-'*10) # Makes Output Readable # Method 2 : Composite Loss (MSE + MaxEnt) for AutoEncoder and Mean Squared Error Loss for LSTM os.makedirs('../Models/Method2', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = '../Models/Method2/model_autoencoder_m2.pth' trainer_autoencoder_m2 = Trainer(model=model_autoencoder, loss_function=loss_mep, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) - print('Method-2 AutoEncoder Trainer Initialized.') - print('Method-2 LSTM == Method-1 LSTM') - print('-'*10) # Makes Output Readable + if rank == 0: + print('Method-2 AutoEncoder Trainer Initialized.') + print('Method-2 LSTM == Method-1 LSTM') + print('-'*10) # Makes Output Readable # Method 3 : Mean Squared Error Loss for AutoEncoder and SSIM Loss for LSTM os.makedirs('../Models/Method3', exist_ok=True) # Creating Directory for Model Saving - print('Method-3 AutoEncoder == Method-1 AutoEncoder') + if rank == 0: + print('Method-3 AutoEncoder == Method-1 AutoEncoder') model_save_path_lstm = '../Models/Method3/model_lstm_m3.pth' trainer_lstm_m3 = Trainer(model_lstm, loss_ssim, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) - print('Method-3 LSTM Trainer Initialized.') - print('-'*10) # Makes Output Readable + if rank == 0: + print('Method-3 LSTM Trainer Initialized.') + print('-'*10) # Makes Output Readable # Method 4 : Proposed Method : Composite Loss (MSE + MaxEnt) for AutoEncoder and SSIM Loss for LSTM - print('Method-4 AutoEncoder == Method-2 AutoEncoder') - print('Method-4 LSTM == Method-3 LSTM') - - print('-'*20) # Makes Output Readable + if rank == 0: + print('Method-4 AutoEncoder == Method-2 AutoEncoder') + print('Method-4 LSTM == Method-3 LSTM') + print('-'*20) # Makes Output Readable ''' @@ -109,62 +125,82 @@ def main(rank): # Method-1 try: epochs = 1 - print('Method-1 AutoEncoder Training Start') + if rank == 0: + print('Method-1 AutoEncoder Training Start') model_autoencoder_m1 = trainer_autoencoder_baseline.train_autoencoder(epochs, data_autoencoder_train, data_autoencoder_val) - print('Method-1 AutoEncoder Training Complete.') + if rank == 0: + print('Method-1 AutoEncoder Training Complete.') except Exception as e: - print(f"Method-1 AutoEncoder Training Error : \n{e}") + if rank == 0: + print(f"Method-1 AutoEncoder Training Error : \n{e}") traceback.print_exc() finally: - trainer_autoencoder_baseline.cleanup_ddp() - print('-'*10) # Makes Output Readable + if rank == 0: + trainer_autoencoder_baseline.cleanup_ddp() + if rank == 0: + print('-'*10) # Makes Output Readable try: epochs = 1 - print('Method-1 LSTM Training Start') + if rank == 0: + print('Method-1 LSTM Training Start') model_lstm_m1 = 
trainer_lstm_baseline.train_lstm(epochs, data_lstm_train, data_lstm_val) - print('Method-1 LSTM Training Complete.') + if rank == 0: + print('Method-1 LSTM Training Complete.') except Exception as e: - print(f"Method-1 LSTM Training Error : \n{e}") + if rank == 0: + print(f"Method-1 LSTM Training Error : \n{e}") traceback.print_exc() finally: - trainer_lstm_baseline.cleanup_ddp() - print('-'*20) # Makes Output Readable + if rank == 0: + trainer_lstm_baseline.cleanup_ddp() + if rank == 0: + print('-'*20) # Makes Output Readable # Method-2 try: epochs = 1 - print('Method-2 AutoEncoder Training Start') + if rank == 0: + print('Method-2 AutoEncoder Training Start') model_autoencoder_m2 = trainer_autoencoder_m2.train_autoencoder(epochs, data_autoencoder_train, data_autoencoder_val) - print('Method-2 AutoEncoder Training Complete.') + if rank == 0: + print('Method-2 AutoEncoder Training Complete.') except Exception as e: - print(f"Method-2 AutoEncoder Training Error : \n{e}") + if rank == 0: + print(f"Method-2 AutoEncoder Training Error : \n{e}") traceback.print_exc() finally: trainer_autoencoder_m2.cleanup_ddp() - print('-'*10) # Makes Output Readable - print("Method-2 LSTM == Method-1 LSTM, No Need To Train Again.") - print('-'*20) # Makes Output Readable + if rank == 0: + print('-'*10) # Makes Output Readable + print("Method-2 LSTM == Method-1 LSTM, No Need To Train Again.") + print('-'*20) # Makes Output Readable # Method-3 - print("Method-3 AutoEncoder == Method-1 AutoEncoder, No Need To Train Again.") - print('-'*10) # Makes Output Readable + if rank == 0: + print("Method-3 AutoEncoder == Method-1 AutoEncoder, No Need To Train Again.") + print('-'*10) # Makes Output Readable try: epochs = 1 - print('Method-3 LSTM Training Start.') + if rank == 0: + print('Method-3 LSTM Training Start.') model_lstm_m3 = trainer_lstm_m3.train_lstm(epochs, data_lstm_train, data_lstm_val) - print('Method-3 LSTM Training Complete.') + if rank == 0: + print('Method-3 LSTM Training Complete.') except Exception as e: - print(f"Method-3 LSTM Training Error : \n{e}") + if rank == 0: + print(f"Method-3 LSTM Training Error : \n{e}") traceback.print_exc() finally: trainer_lstm_m3.cleanup_ddp() - print('-'*20) # Makes Output Readable + if rank == 0: + print('-'*20) # Makes Output Readable # Method-4 - print("Method-4 AutoEncoder == Method-2 AutoEncoder, No Need To Train Again.") - print('-'*10) # Makes Output Readable - print("Method-4 LSTM == Method-3 LSTM, No Need To Train Again.") - print('-'*20) # Makes Output Readable + if rank == 0: + print("Method-4 AutoEncoder == Method-2 AutoEncoder, No Need To Train Again.") + print('-'*10) # Makes Output Readable + print("Method-4 LSTM == Method-3 LSTM, No Need To Train Again.") + print('-'*20) # Makes Output Readable if __name__ == '__main__': diff --git a/Code/training.py b/Code/training.py index 90873e3..3f75fc2 100644 --- a/Code/training.py +++ b/Code/training.py @@ -42,7 +42,7 @@ def save_model(self): def train_autoencoder(self, epochs, train_loader, val_loader): # Print Names of All Available GPUs (if any) to Train the Model - if torch.cuda.device_count() > 0: + if torch.cuda.device_count() > 0 and self.rank == 0: gpu_names = ', '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) print("\tGPUs being used for Training : ",gpu_names) best_val_loss = float('inf') @@ -63,7 +63,8 @@ def train_autoencoder(self, epochs, train_loader, val_loader): val_loss = sum(self.loss_function(self.model(input.to(self.device)), target.to(self.device)).item() 
for input, target in val_loader) # Compute Total Validation Loss val_loss /= len(val_loader) # Compute Average Validation Loss # Print epochs and losses - print(f'\tAutoEncoder Epoch {epoch+1}/{epochs} --- Training Loss: {loss.item()} --- Validation Loss: {val_loss}') + if self.rank == 0: + print(f'\tAutoEncoder Epoch {epoch+1}/{epochs} --- Training Loss: {loss.item()} --- Validation Loss: {val_loss}') # If the current validation loss is lower than the best validation loss, save the model if val_loss < best_val_loss: best_val_loss = val_loss # Update the best validation loss @@ -73,7 +74,7 @@ def train_autoencoder(self, epochs, train_loader, val_loader): def train_lstm(self, epochs, train_loader, val_loader): # Print Names of All Available GPUs (if any) to Train the Model - if torch.cuda.device_count() > 0: + if torch.cuda.device_count() > 0 and self.rank == 0: gpu_names = ', '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) print("\tGPUs being used for Training : ",gpu_names) best_val_loss = float('inf') @@ -97,7 +98,8 @@ def train_lstm(self, epochs, train_loader, val_loader): val_loss += self.loss_function(output_sequence, target_sequence).item() # Accumulate loss val_loss /= len(val_loader) # Average validation loss # Print epochs and losses - print(f'\tLSTM Epoch {epoch+1}/{epochs} --- Training Loss: {loss.item()} --- Validation Loss: {val_loss}') + if self.rank == 0: + print(f'\tLSTM Epoch {epoch+1}/{epochs} --- Training Loss: {loss.item()} --- Validation Loss: {val_loss}') # Model saving based on validation loss if val_loss < best_val_loss: best_val_loss = val_loss From ff6fc090e635dc828bd70293a4b4f683004afa94 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:56:34 -0800 Subject: [PATCH 12/44] Removed Debugging Statements --- Code/main.py | 1 - Code/training.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Code/main.py b/Code/main.py index a137ac2..f12b2e3 100644 --- a/Code/main.py +++ b/Code/main.py @@ -33,7 +33,6 @@ def main_worker(rank, world_size): torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) - print(f"Initialized process group for rank {rank}, world size {world_size}") main(rank) # Call the existing main function. 
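The `if rank == 0:` guards introduced in this patch keep each message from being printed once per spawned process. If that guard starts to dominate the code, one common alternative (illustrative only, not part of these patches; the helper name is hypothetical) is a logger that is only verbose on the master rank:

    # Illustrative rank-aware logging helper.
    import logging
    import torch.distributed as dist

    def get_rank_logger(name='train'):
        rank = dist.get_rank() if dist.is_initialized() else 0
        logger = logging.getLogger(name)
        if not logger.handlers:
            logger.addHandler(logging.StreamHandler())
        # Only rank 0 emits INFO-level progress messages; other ranks stay quiet
        # unless something at WARNING level or above happens.
        logger.setLevel(logging.INFO if rank == 0 else logging.WARNING)
        return logger

    # usage inside main(rank):
    #   log = get_rank_logger()
    #   log.info('Importing Dataset Complete.')   # printed once, by rank 0 only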
def main(rank): diff --git a/Code/training.py b/Code/training.py index 3f75fc2..cf35cf2 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,6 +13,7 @@ import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist +import warnings # Define Training Class class Trainer(): @@ -26,10 +27,11 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: - self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) # Remove Warnings Shown because of 'find_unused_parameters=True' + self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process - print(f'Process {self.rank} is using {self.device}') def cleanup_ddp(self): if dist.is_initialized(): From d7ddb3a713e8e70ad9ce02c9ef8e614e1bf97058 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 16:59:19 -0800 Subject: [PATCH 13/44] Removed Debugging Statement --- Code/training.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Code/training.py b/Code/training.py index cf35cf2..1ea4d76 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,7 +13,6 @@ import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist -import warnings # Define Training Class class Trainer(): @@ -27,9 +26,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) # Remove Warnings Shown because of 'find_unused_parameters=True' - self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) + self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=False) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From 4100196a6747d17902feaae34175a7c1a36ea018 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 17:06:28 -0800 Subject: [PATCH 14/44] Fized Issue with find_unused_parameters in DDIP --- Code/main.py | 28 ++++++++++++++++++++++++---- Code/training.py | 6 +++--- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/Code/main.py b/Code/main.py index f12b2e3..44e7693 100644 --- a/Code/main.py +++ b/Code/main.py @@ -83,11 +83,21 @@ def main(rank): # Method 1 : Baseline : Mean Squared Error Loss for AutoEncoder and LSTM os.makedirs('../Models/Method1', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = '../Models/Method1/model_autoencoder_m1.pth' - trainer_autoencoder_baseline = Trainer(model_autoencoder, loss_mse, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) + trainer_autoencoder_baseline = Trainer(model_autoencoder, + loss_mse, + optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), + model_save_path=model_save_path_ae, + rank=rank, + 
find_unused_parameters=False) if rank == 0: print('Method-1 AutoEncoder Trainer Initialized.') model_save_path_lstm = '../Models/Method1/model_lstm_m1.pth' - trainer_lstm_baseline = Trainer(model_lstm, loss_mse, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) + trainer_lstm_baseline = Trainer(model_lstm, + loss_mse, + optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), + model_save_path=model_save_path_lstm, + rank=rank, + find_unused_parameters=True) if rank == 0: print('Method-1 LSTM Trainer Initialized.') print('-'*10) # Makes Output Readable @@ -95,7 +105,12 @@ def main(rank): # Method 2 : Composite Loss (MSE + MaxEnt) for AutoEncoder and Mean Squared Error Loss for LSTM os.makedirs('../Models/Method2', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = '../Models/Method2/model_autoencoder_m2.pth' - trainer_autoencoder_m2 = Trainer(model=model_autoencoder, loss_function=loss_mep, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) + trainer_autoencoder_m2 = Trainer(model=model_autoencoder, + loss_function=loss_mep, + optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), + model_save_path=model_save_path_ae, + rank=rank, + find_unused_parameters=False) if rank == 0: print('Method-2 AutoEncoder Trainer Initialized.') print('Method-2 LSTM == Method-1 LSTM') @@ -106,7 +121,12 @@ def main(rank): if rank == 0: print('Method-3 AutoEncoder == Method-1 AutoEncoder') model_save_path_lstm = '../Models/Method3/model_lstm_m3.pth' - trainer_lstm_m3 = Trainer(model_lstm, loss_ssim, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) + trainer_lstm_m3 = Trainer(model_lstm, + loss_ssim, + optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), + model_save_path=model_save_path_lstm, + rank=rank, + find_unused_parameters=True) if rank == 0: print('Method-3 LSTM Trainer Initialized.') print('-'*10) # Makes Output Readable diff --git a/Code/training.py b/Code/training.py index 1ea4d76..9f24db3 100644 --- a/Code/training.py +++ b/Code/training.py @@ -16,7 +16,7 @@ # Define Training Class class Trainer(): - def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None): + def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None, find_unused_parameters=True): self.rank = rank # Rank of the current process self.device = torch.device(f'cuda:{rank}' if torch.cuda.is_available() else 'cpu') self.model = model.to(self.device) @@ -26,10 +26,10 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: - self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=False) + self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=find_unused_parameters) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process - + def cleanup_ddp(self): if dist.is_initialized(): dist.destroy_process_group() From 9a0b5a0eb9d690f83a1a88d9535d1e9fab388191 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 17:09:29 -0800 Subject: [PATCH 15/44] Removed Debugging Statements --- Code/main.py | 12 ++++-------- Code/training.py 
| 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Code/main.py b/Code/main.py index 44e7693..6cf173d 100644 --- a/Code/main.py +++ b/Code/main.py @@ -87,8 +87,7 @@ def main(rank): loss_mse, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, - rank=rank, - find_unused_parameters=False) + rank=rank) if rank == 0: print('Method-1 AutoEncoder Trainer Initialized.') model_save_path_lstm = '../Models/Method1/model_lstm_m1.pth' @@ -96,8 +95,7 @@ def main(rank): loss_mse, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, - rank=rank, - find_unused_parameters=True) + rank=rank) if rank == 0: print('Method-1 LSTM Trainer Initialized.') print('-'*10) # Makes Output Readable @@ -109,8 +107,7 @@ def main(rank): loss_function=loss_mep, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, - rank=rank, - find_unused_parameters=False) + rank=rank) if rank == 0: print('Method-2 AutoEncoder Trainer Initialized.') print('Method-2 LSTM == Method-1 LSTM') @@ -125,8 +122,7 @@ def main(rank): loss_ssim, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, - rank=rank, - find_unused_parameters=True) + rank=rank) if rank == 0: print('Method-3 LSTM Trainer Initialized.') print('-'*10) # Makes Output Readable diff --git a/Code/training.py b/Code/training.py index 9f24db3..86428bc 100644 --- a/Code/training.py +++ b/Code/training.py @@ -16,7 +16,7 @@ # Define Training Class class Trainer(): - def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None, find_unused_parameters=True): + def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None): self.rank = rank # Rank of the current process self.device = torch.device(f'cuda:{rank}' if torch.cuda.is_available() else 'cpu') self.model = model.to(self.device) @@ -26,7 +26,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: - self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=find_unused_parameters) + self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From 47ec6ed8a8736504a67b4a29c73a555647bea9f0 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sat, 30 Dec 2023 17:15:53 -0800 Subject: [PATCH 16/44] Removed Warnings from being printed on Terminal --- Code/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Code/main.py b/Code/main.py index 6cf173d..58abe93 100644 --- a/Code/main.py +++ b/Code/main.py @@ -17,6 +17,7 @@ import torch import torch.multiprocessing as mp import torch.distributed as dist +import warnings # Define Working Directories grayscale_dir = '../Dataset/Greyscale' @@ -219,5 +220,8 @@ def main(rank): if __name__ == '__main__': + if dist.get_rank() == 0: + # Remove Warnings + warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel.distributed') world_size = torch.cuda.device_count() # Number of available GPUs mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True) From 06f9df777d92e1a1657d4145b053ee60b501b94e Mon Sep 17 00:00:00 2001 
From: iSiddharth20 Date: Sat, 30 Dec 2023 17:18:55 -0800 Subject: [PATCH 17/44] Changed Warning Printing Mechanish --- Code/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Code/main.py b/Code/main.py index 58abe93..c0362b2 100644 --- a/Code/main.py +++ b/Code/main.py @@ -34,6 +34,9 @@ def main_worker(rank, world_size): torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) + if dist.get_rank() == 0: + # Remove Warnings + warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel') main(rank) # Call the existing main function. def main(rank): @@ -220,8 +223,5 @@ def main(rank): if __name__ == '__main__': - if dist.get_rank() == 0: - # Remove Warnings - warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel.distributed') world_size = torch.cuda.device_count() # Number of available GPUs - mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True) + mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True) \ No newline at end of file From d82c40016aebd5548341434597e65011592e8e7d Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:28:27 -0800 Subject: [PATCH 18/44] Atempt to Hide UserWarnings from Console --- Code/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Code/main.py b/Code/main.py index c0362b2..ae1b2b7 100644 --- a/Code/main.py +++ b/Code/main.py @@ -223,5 +223,8 @@ def main(rank): if __name__ == '__main__': + if dist.get_rank() == 0: + warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel.distributed') + world_size = torch.cuda.device_count() # Number of available GPUs mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True) \ No newline at end of file From 83f9a26c0bc26471a807a123b6f5d28c030b77b7 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:32:29 -0800 Subject: [PATCH 19/44] Fixed Parameters in Trainer Initializations --- Code/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Code/main.py b/Code/main.py index ae1b2b7..8963824 100644 --- a/Code/main.py +++ b/Code/main.py @@ -87,16 +87,16 @@ def main(rank): # Method 1 : Baseline : Mean Squared Error Loss for AutoEncoder and LSTM os.makedirs('../Models/Method1', exist_ok=True) # Creating Directory for Model Saving model_save_path_ae = '../Models/Method1/model_autoencoder_m1.pth' - trainer_autoencoder_baseline = Trainer(model_autoencoder, - loss_mse, + trainer_autoencoder_baseline = Trainer(model=model_autoencoder, + loss_function=loss_mse, optimizer=torch.optim.Adam(model_autoencoder.parameters(), lr=0.001), model_save_path=model_save_path_ae, rank=rank) if rank == 0: print('Method-1 AutoEncoder Trainer Initialized.') model_save_path_lstm = '../Models/Method1/model_lstm_m1.pth' - trainer_lstm_baseline = Trainer(model_lstm, - loss_mse, + trainer_lstm_baseline = Trainer(model=model_lstm, + loss_function=loss_mse, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), model_save_path=model_save_path_lstm, rank=rank) @@ -122,8 +122,8 @@ def main(rank): if rank == 0: print('Method-3 AutoEncoder == Method-1 AutoEncoder') model_save_path_lstm = '../Models/Method3/model_lstm_m3.pth' - trainer_lstm_m3 = Trainer(model_lstm, - loss_ssim, + trainer_lstm_m3 = Trainer(model=model_lstm, + loss_function=loss_ssim, optimizer=torch.optim.Adam(model_lstm.parameters(), lr=0.001), 
model_save_path=model_save_path_lstm, rank=rank) From 54e8e38123ded0c55deab807498b6179ecf461ab Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:34:59 -0800 Subject: [PATCH 20/44] Fixed UserWarning for --- Code/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index 8963824..cb5f5d5 100644 --- a/Code/main.py +++ b/Code/main.py @@ -34,8 +34,8 @@ def main_worker(rank, world_size): torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) - if dist.get_rank() == 0: - # Remove Warnings + # Filter out the warnings after the process group has been initialized. + if rank == 0: warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel') main(rank) # Call the existing main function. From fa8e299b17fceb73db2b8f2942fb5fce32c4f2c0 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:38:55 -0800 Subject: [PATCH 21/44] Added Env Variables in Script --- Code/main.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Code/main.py b/Code/main.py index cb5f5d5..2e8290c 100644 --- a/Code/main.py +++ b/Code/main.py @@ -29,14 +29,16 @@ batch_size = 2 def main_worker(rank, world_size): + # Set environment variables + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12345' # Initialize the distributed environment. torch.manual_seed(0) torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) - # Filter out the warnings after the process group has been initialized. - if rank == 0: - warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel') + # Suppress warnings after initializing the process group. + warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel') main(rank) # Call the existing main function. def main(rank): @@ -223,8 +225,5 @@ def main(rank): if __name__ == '__main__': - if dist.get_rank() == 0: - warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel.distributed') - world_size = torch.cuda.device_count() # Number of available GPUs mp.spawn(main_worker, args=(world_size,), nprocs=world_size, join=True) \ No newline at end of file From f91660201c8201ef8a621a68358a0d9019a36a9e Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:43:45 -0800 Subject: [PATCH 22/44] Updated UserWarnings --- Code/main.py | 4 ++-- Code/training.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index 2e8290c..146c7e3 100644 --- a/Code/main.py +++ b/Code/main.py @@ -37,8 +37,8 @@ def main_worker(rank, world_size): torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) - # Suppress warnings after initializing the process group. - warnings.filterwarnings("ignore", category=UserWarning, module='torch.nn.parallel') + # Suppress warnings about unused parameters specifically. + warnings.filterwarnings("ignore", message="*find_unused_parameters=True*", category=UserWarning, module='torch.nn.parallel') main(rank) # Call the existing main function. 
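For context on the flag these warnings are about: find_unused_parameters=True makes DDP walk the autograd graph after every forward pass to find parameters that received no gradient. It is only needed when forward() can skip some registered parameters; when every parameter is always used, the flag just adds per-iteration overhead, and PyTorch emits exactly the UserWarning that the surrounding patches are trying to silence. A hypothetical illustration (the Branchy model below is not the project's ConvLSTM or autoencoder, and the DDP call is left commented because it needs an initialized process group):

    # Illustrative only: a forward pass that skips a registered parameter on some
    # inputs, which is the situation find_unused_parameters=True exists for.
    import torch
    import torch.nn as nn
    from torch.nn.parallel import DistributedDataParallel as DDP

    class Branchy(nn.Module):
        def __init__(self):
            super().__init__()
            self.a = nn.Linear(4, 4)
            self.b = nn.Linear(4, 4)      # receives no gradient whenever use_b is False
        def forward(self, x, use_b=False):
            return self.b(self.a(x)) if use_b else self.a(x)

    # model = DDP(Branchy().to(rank), device_ids=[rank], find_unused_parameters=True)
    # With use_b=False, self.b gets no gradient; without find_unused_parameters=True,
    # DDP would error (or stall) waiting for that gradient. When every parameter *is*
    # used, the flag only adds overhead and PyTorch warns that it can be dropped.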
def main(rank): diff --git a/Code/training.py b/Code/training.py index 86428bc..9733db0 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,6 +13,7 @@ import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist +import warnings # Define Training Class class Trainer(): @@ -26,6 +27,8 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: + # Suppress warnings about unused parameters specifically. + warnings.filterwarnings("ignore", message="*find_unused_parameters=True*", category=UserWarning, module='torch.nn.parallel') self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From aad8403a9473b7cb474b107fde3add6bc59c8750 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:47:23 -0800 Subject: [PATCH 23/44] Fixed UserWarnings for find_unused_parameters --- Code/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Code/main.py b/Code/main.py index 146c7e3..80be4dd 100644 --- a/Code/main.py +++ b/Code/main.py @@ -38,7 +38,8 @@ def main_worker(rank, world_size): torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message="*find_unused_parameters=True*", category=UserWarning, module='torch.nn.parallel') + if rank == 0: + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True.*") main(rank) # Call the existing main function. def main(rank): From fa5c7f2af09ca6bc4c11a45ed65872a7ee3d152a Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:50:21 -0800 Subject: [PATCH 24/44] Fixed UserWarnings for find_unused_parameters --- Code/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Code/main.py b/Code/main.py index 80be4dd..786806b 100644 --- a/Code/main.py +++ b/Code/main.py @@ -39,7 +39,7 @@ def main_worker(rank, world_size): dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. if rank == 0: - warnings.filterwarnings("ignore", message=".*find_unused_parameters=True.*") + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True.*", category=UserWarning, module='torch.nn.parallel') main(rank) # Call the existing main function. def main(rank): From 10fd12b62a65936c7c301ae8884e79189bb59395 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:51:32 -0800 Subject: [PATCH 25/44] Fixed UserWarnings for find_unused_parameters --- Code/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Code/training.py b/Code/training.py index 9733db0..109bf33 100644 --- a/Code/training.py +++ b/Code/training.py @@ -28,7 +28,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: # Suppress warnings about unused parameters specifically. 
- warnings.filterwarnings("ignore", message="*find_unused_parameters=True*", category=UserWarning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True.*", category=UserWarning, module='torch.nn.parallel') self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From fc0fa76339b484df84cd9860db45083e93062d64 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:56:43 -0800 Subject: [PATCH 26/44] Fixed UserWarnings for find_unused_parameters --- Code/main.py | 2 +- Code/training.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index 786806b..de78b72 100644 --- a/Code/main.py +++ b/Code/main.py @@ -39,7 +39,7 @@ def main_worker(rank, world_size): dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. if rank == 0: - warnings.filterwarnings("ignore", message=".*find_unused_parameters=True.*", category=UserWarning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=UserWarning, module='torch.nn.parallel') main(rank) # Call the existing main function. def main(rank): diff --git a/Code/training.py b/Code/training.py index 109bf33..fd9ed55 100644 --- a/Code/training.py +++ b/Code/training.py @@ -28,7 +28,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message=".*find_unused_parameters=True.*", category=UserWarning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=UserWarning, module='torch.nn.parallel') self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From 455b6ee5b638f7ad7a67fd43d567047073444b7d Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:57:55 -0800 Subject: [PATCH 27/44] Fixed UserWarnings for find_unused_parameters --- Code/main.py | 2 +- Code/training.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index de78b72..057b96c 100644 --- a/Code/main.py +++ b/Code/main.py @@ -39,7 +39,7 @@ def main_worker(rank, world_size): dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. if rank == 0: - warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=UserWarning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=Warning, module='torch.nn.parallel') main(rank) # Call the existing main function. 
def main(rank): diff --git a/Code/training.py b/Code/training.py index fd9ed55..305061c 100644 --- a/Code/training.py +++ b/Code/training.py @@ -28,7 +28,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=UserWarning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=Warning, module='torch.nn.parallel') self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From b86d3a42675fb585a9cf551b15ce08b3c66f1853 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 11:59:40 -0800 Subject: [PATCH 28/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 3 +-- Code/training.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Code/main.py b/Code/main.py index 057b96c..f02530d 100644 --- a/Code/main.py +++ b/Code/main.py @@ -38,8 +38,7 @@ def main_worker(rank, world_size): torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. - if rank == 0: - warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=Warning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message="* find_unused_parameters=True *", category=Warning) main(rank) # Call the existing main function. def main(rank): diff --git a/Code/training.py b/Code/training.py index 305061c..f0f8a35 100644 --- a/Code/training.py +++ b/Code/training.py @@ -28,7 +28,7 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: # Suppress warnings about unused parameters specifically. 
- warnings.filterwarnings("ignore", message=".*find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass.*", category=Warning, module='torch.nn.parallel') + warnings.filterwarnings("ignore", message="* find_unused_parameters=True *", category=Warning) self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From 6015b6f67e0a1414d6e1ae51c7112a238692005b Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:00:49 -0800 Subject: [PATCH 29/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 2 +- Code/set_env_var.sh | 5 ----- Code/training.py | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 Code/set_env_var.sh diff --git a/Code/main.py b/Code/main.py index f02530d..59cab03 100644 --- a/Code/main.py +++ b/Code/main.py @@ -38,7 +38,7 @@ def main_worker(rank, world_size): torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message="* find_unused_parameters=True *", category=Warning) + warnings.filterwarnings("ignore", message="*find_unused_parameters=True*", module="reducer.cpp") main(rank) # Call the existing main function. def main(rank): diff --git a/Code/set_env_var.sh b/Code/set_env_var.sh deleted file mode 100644 index b7dcf76..0000000 --- a/Code/set_env_var.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -# Set environment variables -export MASTER_ADDR=localhost -export MASTER_PORT=12345 \ No newline at end of file diff --git a/Code/training.py b/Code/training.py index f0f8a35..5058d8f 100644 --- a/Code/training.py +++ b/Code/training.py @@ -27,8 +27,6 @@ def __init__(self, model, loss_function, optimizer=None, model_save_path=None, r self.optimizer = optimizer if optimizer is not None else torch.optim.Adam(self.model.parameters(), lr=0.001) # Wrap model with DDP if torch.cuda.device_count() > 1 and rank is not None: - # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message="* find_unused_parameters=True *", category=Warning) self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) # Define the path to save the model self.model_save_path = model_save_path if rank == 0 else None # Only save on master process From c9e4d9d8af0c466c50f6209a0a067031efb0d2f2 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:02:03 -0800 Subject: [PATCH 30/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Code/main.py b/Code/main.py index 59cab03..65908db 100644 --- a/Code/main.py +++ b/Code/main.py @@ -38,7 +38,7 @@ def main_worker(rank, world_size): torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message="*find_unused_parameters=True*", module="reducer.cpp") + warnings.filterwarnings("ignore", message="*find_unused_parameters=True*") main(rank) # Call the existing main function. 
def main(rank): From 1034b2320b73445d250613cc3f5ab7a59d22bae9 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:02:35 -0800 Subject: [PATCH 31/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Code/main.py b/Code/main.py index 65908db..f433694 100644 --- a/Code/main.py +++ b/Code/main.py @@ -38,7 +38,7 @@ def main_worker(rank, world_size): torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) # Suppress warnings about unused parameters specifically. - warnings.filterwarnings("ignore", message="*find_unused_parameters=True*") + # warnings.filterwarnings("ignore", message="*find_unused_parameters=True*") main(rank) # Call the existing main function. def main(rank): From cb5b55d789860866df04437852d2d8ea02c5fad7 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:06:12 -0800 Subject: [PATCH 32/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Code/main.py b/Code/main.py index f433694..648c20e 100644 --- a/Code/main.py +++ b/Code/main.py @@ -17,7 +17,10 @@ import torch import torch.multiprocessing as mp import torch.distributed as dist + +# Suppress warnings about find_unused_parameters=True import warnings +warnings.filterwarnings("ignore", category=UserWarning, module="torch.distributed") # Define Working Directories grayscale_dir = '../Dataset/Greyscale' @@ -37,8 +40,6 @@ def main_worker(rank, world_size): torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) - # Suppress warnings about unused parameters specifically. - # warnings.filterwarnings("ignore", message="*find_unused_parameters=True*") main(rank) # Call the existing main function. 
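Several of the attempts above pass shell-style wildcards to warnings.filterwarnings, but the message and module arguments are regular expressions: message must match the start of the warning text, module is matched against the Python module name that raised the warning (so a C++ file name like reducer.cpp will not match), and both are compiled when the filter is installed, so a pattern beginning with * raises re.error instead of installing a filter. A hedged sketch of a filter that does match the DDP warning (the message wording is taken from the patches above and can vary across PyTorch versions):

    # Suppress only the DDP "find_unused_parameters=True ... did not find any unused
    # parameters" UserWarning. The message argument is a regex anchored at the start
    # of the warning text, so use '.*' rather than a shell-style '*'.
    import warnings

    warnings.filterwarnings(
        "ignore",
        message=r".*find_unused_parameters=True.*",   # valid regex; "*..." would raise re.error
        category=UserWarning,
    )
    # Warning filters are per-process: with mp.spawn, install this in code that every
    # worker imports (e.g., at module level in training.py), not only in the parent.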
def main(rank): From e0a707c08f7fa24ee44fcedb33c4c4adefacde54 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:07:05 -0800 Subject: [PATCH 33/44] Fixed Warnings for find_unused_parameters --- Code/training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Code/training.py b/Code/training.py index 5058d8f..81c4535 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,7 +13,10 @@ import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist + +# Suppress warnings about find_unused_parameters=True import warnings +warnings.filterwarnings("ignore", category=UserWarning, module="torch.distributed") # Define Training Class class Trainer(): From 6cb35b4bcd84e39f71108c6158fd9a326c82140e Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:08:27 -0800 Subject: [PATCH 34/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 2 +- Code/training.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Code/main.py b/Code/main.py index 648c20e..f34ed58 100644 --- a/Code/main.py +++ b/Code/main.py @@ -20,7 +20,7 @@ # Suppress warnings about find_unused_parameters=True import warnings -warnings.filterwarnings("ignore", category=UserWarning, module="torch.distributed") +warnings.filterwarnings("ignore", category=UserWarning, module="torch") # Define Working Directories grayscale_dir = '../Dataset/Greyscale' diff --git a/Code/training.py b/Code/training.py index 81c4535..1b45e96 100644 --- a/Code/training.py +++ b/Code/training.py @@ -10,13 +10,12 @@ # Import Necessary Libraries import torch -import torch.nn as nn from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist # Suppress warnings about find_unused_parameters=True import warnings -warnings.filterwarnings("ignore", category=UserWarning, module="torch.distributed") +warnings.filterwarnings("ignore", category=UserWarning, module="torch") # Define Training Class class Trainer(): From ea3bc396e1549d76c2a50b3e513f5ebd169bc295 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:15:07 -0800 Subject: [PATCH 35/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 8 ++++---- Code/training.py | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/Code/main.py b/Code/main.py index f34ed58..fd351f6 100644 --- a/Code/main.py +++ b/Code/main.py @@ -10,7 +10,6 @@ from losses import LossMSE, LossMEP, SSIMLoss from training import Trainer - # Import Necessary Libraries import os import traceback @@ -18,9 +17,10 @@ import torch.multiprocessing as mp import torch.distributed as dist -# Suppress warnings about find_unused_parameters=True -import warnings -warnings.filterwarnings("ignore", category=UserWarning, module="torch") +# Disable below 3 Lines if you want Detailed Errors and Warnings Printed on Terminal +import os +import sys +sys.stderr = open(os.devnull, 'w') # Define Working Directories grayscale_dir = '../Dataset/Greyscale' diff --git a/Code/training.py b/Code/training.py index 1b45e96..373eb6e 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,10 +13,6 @@ from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist -# Suppress warnings about find_unused_parameters=True -import warnings -warnings.filterwarnings("ignore", category=UserWarning, module="torch") - # Define Training Class class Trainer(): def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None): From 
254c1269a22d3a9a63617192857766f7debf3b63 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:17:08 -0800 Subject: [PATCH 36/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Code/main.py b/Code/main.py index fd351f6..0da4390 100644 --- a/Code/main.py +++ b/Code/main.py @@ -10,6 +10,7 @@ from losses import LossMSE, LossMEP, SSIMLoss from training import Trainer + # Import Necessary Libraries import os import traceback @@ -17,10 +18,12 @@ import torch.multiprocessing as mp import torch.distributed as dist -# Disable below 3 Lines if you want Detailed Errors and Warnings Printed on Terminal +# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal +import warnings import os import sys sys.stderr = open(os.devnull, 'w') +warnings.filterwarnings("ignore", category=UserWarning, module="torch") # Define Working Directories grayscale_dir = '../Dataset/Greyscale' From 598e364b8aca25ca08ddf6b68f4ad40d92a9ff18 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:18:35 -0800 Subject: [PATCH 37/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 7 ------- Code/training.py | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Code/main.py b/Code/main.py index 0da4390..778287c 100644 --- a/Code/main.py +++ b/Code/main.py @@ -18,13 +18,6 @@ import torch.multiprocessing as mp import torch.distributed as dist -# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal -import warnings -import os -import sys -sys.stderr = open(os.devnull, 'w') -warnings.filterwarnings("ignore", category=UserWarning, module="torch") - # Define Working Directories grayscale_dir = '../Dataset/Greyscale' rgb_dir = '../Dataset/RGB' diff --git a/Code/training.py b/Code/training.py index 373eb6e..d816501 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,6 +13,13 @@ from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist +# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal +# import warnings +import os +import sys +sys.stderr = open(os.devnull, 'w') +# warnings.filterwarnings("ignore", category=UserWarning, module="torch") + # Define Training Class class Trainer(): def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None): From 8f1156dee40c86c957c12befc17597a57536231c Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:19:36 -0800 Subject: [PATCH 38/44] Fixed Warnings for find_unused_parameters --- Code/training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Code/training.py b/Code/training.py index d816501..a773198 100644 --- a/Code/training.py +++ b/Code/training.py @@ -14,11 +14,11 @@ import torch.distributed as dist # Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal -# import warnings +import warnings import os import sys sys.stderr = open(os.devnull, 'w') -# warnings.filterwarnings("ignore", category=UserWarning, module="torch") +warnings.filterwarnings("ignore", category=UserWarning, module="torch") # Define Training Class class Trainer(): From cac3c481995161c5d14dd4e2556ebf2804311f55 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:28:37 -0800 Subject: [PATCH 39/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Code/main.py 
b/Code/main.py index 778287c..43be0eb 100644 --- a/Code/main.py +++ b/Code/main.py @@ -3,6 +3,13 @@ -------------------------------------------------------------------------------- ''' +# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal +import warnings +warnings.filterwarnings("ignore", category=UserWarning, module="torch") +import os +import sys +sys.stderr = open(os.devnull, 'w') + # Importing Custom Modules from data import CustomDataset from autoencoder_model import Grey2RGBAutoEncoder @@ -10,7 +17,6 @@ from losses import LossMSE, LossMEP, SSIMLoss from training import Trainer - # Import Necessary Libraries import os import traceback From 5529513cd25a0fe1b9eb72dcf333e2ccaabdbbcc Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:29:51 -0800 Subject: [PATCH 40/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 2 +- Code/training.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Code/main.py b/Code/main.py index 43be0eb..1601427 100644 --- a/Code/main.py +++ b/Code/main.py @@ -5,9 +5,9 @@ # Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal import warnings -warnings.filterwarnings("ignore", category=UserWarning, module="torch") import os import sys +warnings.filterwarnings("ignore", category=UserWarning, module="torch") sys.stderr = open(os.devnull, 'w') # Importing Custom Modules diff --git a/Code/training.py b/Code/training.py index a773198..1fd555a 100644 --- a/Code/training.py +++ b/Code/training.py @@ -8,6 +8,13 @@ Initialize Best Validation Loss to Infinity as we will save model with lowest validation loss ''' +# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal +import warnings +import os +import sys +warnings.filterwarnings("ignore", category=UserWarning, module="torch") + +sys.stderr = open(os.devnull, 'w') # Import Necessary Libraries import torch from torch.nn.parallel import DistributedDataParallel as DDP From 290180496a5ad4111582e43af20070fb0eb08bc1 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:32:43 -0800 Subject: [PATCH 41/44] Fixed Warnings for find_unused_parameters --- Code/main.py | 7 +++---- Code/training.py | 7 ------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/Code/main.py b/Code/main.py index 1601427..751f918 100644 --- a/Code/main.py +++ b/Code/main.py @@ -3,12 +3,11 @@ -------------------------------------------------------------------------------- ''' -# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal -import warnings +# Disable below 4 Lines if you want Detailed Errors and Warnings Printed on Terminal import os import sys -warnings.filterwarnings("ignore", category=UserWarning, module="torch") -sys.stderr = open(os.devnull, 'w') +sys.stdout = open('/dev/null', 'w') +sys.stderr = open('/dev/null', 'w') # Importing Custom Modules from data import CustomDataset diff --git a/Code/training.py b/Code/training.py index 1fd555a..a773198 100644 --- a/Code/training.py +++ b/Code/training.py @@ -8,13 +8,6 @@ Initialize Best Validation Loss to Infinity as we will save model with lowest validation loss ''' -# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal -import warnings -import os -import sys -warnings.filterwarnings("ignore", category=UserWarning, module="torch") - -sys.stderr = open(os.devnull, 'w') # Import Necessary Libraries import torch from torch.nn.parallel import DistributedDataParallel as DDP 
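[Editor's note — not part of the patch series] Patches 29 through 41 keep adjusting warnings.filterwarnings() without the DDP message ever being suppressed, because the message and module arguments are regular expressions, not shell globs: message is matched against the start of the warning text, so "*find_unused_parameters=True*" can never match (a leading * is not even a valid regular expression, so that call raises re.error), and module is matched against the Python module name issuing the warning, so "reducer.cpp" cannot match either. Redirecting sys.stderr to /dev/null, as patch 41 does and patch 42 removes, hides the warning but also hides every traceback. A minimal sketch of a filter that would match, using a hypothetical helper name and assuming the DDP reducer surfaces the message as a Python UserWarning:

    import warnings

    def silence_unused_params_warning():
        # `message` is a regex anchored at the start of the warning text,
        # hence the leading/trailing ".*" instead of shell-style "*".
        warnings.filterwarnings(
            "ignore",
            message=r".*find_unused_parameters=True.*",
            category=UserWarning,  # assumption: the reducer warning is a UserWarning
        )

Calling such a helper at the top of main_worker() would install the filter in every spawned rank, without discarding the rest of stderr.
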
From 5f28ffaf986326bdcb0e8393de8500c5298a587f Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Sun, 31 Dec 2023 13:36:41 -0800 Subject: [PATCH 42/44] Run : python main.py 2>/dev/null --- Code/main.py | 6 ------ Code/training.py | 7 ------- 2 files changed, 13 deletions(-) diff --git a/Code/main.py b/Code/main.py index 751f918..5dc44bd 100644 --- a/Code/main.py +++ b/Code/main.py @@ -3,12 +3,6 @@ -------------------------------------------------------------------------------- ''' -# Disable below 4 Lines if you want Detailed Errors and Warnings Printed on Terminal -import os -import sys -sys.stdout = open('/dev/null', 'w') -sys.stderr = open('/dev/null', 'w') - # Importing Custom Modules from data import CustomDataset from autoencoder_model import Grey2RGBAutoEncoder diff --git a/Code/training.py b/Code/training.py index a773198..373eb6e 100644 --- a/Code/training.py +++ b/Code/training.py @@ -13,13 +13,6 @@ from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist -# Disable below 5 Lines if you want Detailed Errors and Warnings Printed on Terminal -import warnings -import os -import sys -sys.stderr = open(os.devnull, 'w') -warnings.filterwarnings("ignore", category=UserWarning, module="torch") - # Define Training Class class Trainer(): def __init__(self, model, loss_function, optimizer=None, model_save_path=None, rank=None): From 70742a1d8b9f12c4faf25e4ddb60d0ab7921df77 Mon Sep 17 00:00:00 2001 From: Siddharth Kekre Date: Tue, 2 Jan 2024 00:10:38 +0000 Subject: [PATCH 43/44] Changes AutoEncoder Hidden Layers --- Code/autoencoder_model.py | 4 ++-- Code/main.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Code/autoencoder_model.py b/Code/autoencoder_model.py index 975d07e..c4e34d2 100644 --- a/Code/autoencoder_model.py +++ b/Code/autoencoder_model.py @@ -16,9 +16,9 @@ class Grey2RGBAutoEncoder(nn.Module): def __init__(self): super(Grey2RGBAutoEncoder, self).__init__() # Define the Encoder - self.encoder = self._make_layers([1, 64, 128, 256]) + self.encoder = self._make_layers([1, 8, 16, 32]) # Define the Decoder - self.decoder = self._make_layers([256, 128, 64, 3], decoder=True) + self.decoder = self._make_layers([32, 16, 8, 3], decoder=True) # Helper function to create the encoder or decoder layers. 
def _make_layers(self, channels, decoder=False): diff --git a/Code/main.py b/Code/main.py index 5dc44bd..e09b6aa 100644 --- a/Code/main.py +++ b/Code/main.py @@ -22,8 +22,8 @@ rgb_dir = '../Dataset/RGB' # Define Universal Parameters -image_height = 400 -image_width = 600 +image_height = 3000 +image_width = 4500 batch_size = 2 def main_worker(rank, world_size): From 290f3a7e93a5bbc50a77fc53f85b255cbdc01a92 Mon Sep 17 00:00:00 2001 From: iSiddharth20 Date: Mon, 1 Jan 2024 18:15:07 -0800 Subject: [PATCH 44/44] Added Platform Check for init_process_group backend parameter initialization --- Code/main.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Code/main.py b/Code/main.py index e09b6aa..883d85b 100644 --- a/Code/main.py +++ b/Code/main.py @@ -16,16 +16,24 @@ import torch import torch.multiprocessing as mp import torch.distributed as dist +import platform # Define Working Directories grayscale_dir = '../Dataset/Greyscale' rgb_dir = '../Dataset/RGB' # Define Universal Parameters -image_height = 3000 -image_width = 4500 +image_height = 4000 +image_width = 6000 batch_size = 2 +def get_backend(): + system_type = platform.system() + if system_type == "Linux": + return "nccl" + else: + return "gloo" + def main_worker(rank, world_size): # Set environment variables os.environ['MASTER_ADDR'] = 'localhost' @@ -34,7 +42,7 @@ def main_worker(rank, world_size): torch.manual_seed(0) torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True - dist.init_process_group(backend="nccl", init_method="env://", world_size=world_size, rank=rank) + dist.init_process_group(backend=get_backend(), init_method="env://", world_size=world_size, rank=rank) main(rank) # Call the existing main function. def main(rank):
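
[Editor's note — not part of the patch series] The platform check in patch 44 selects NCCL on any Linux host, but NCCL also requires CUDA devices: on a CPU-only Linux machine init_process_group(backend="nccl") fails, and the entry point shown earlier in the series sets world_size = torch.cuda.device_count(), which is 0 there, so mp.spawn() would launch no workers at all. A minimal sketch that also guards on CUDA availability — an assumption about the intended behaviour, not the author's code:

    import platform
    import torch

    def get_backend():
        # NCCL is the fastest choice for multi-GPU training, but only on Linux
        # *with* CUDA devices; Gloo works on CPU and on Windows/macOS.
        if platform.system() == "Linux" and torch.cuda.is_available():
            return "nccl"
        return "gloo"

Restoring 4000x6000 inputs in the same patch also makes activation memory the dominant cost even with the slimmer 8/16/32 encoder from patch 43, so if out-of-memory errors reappear, the image size is the first parameter to revisit.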