diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py
index 7d68cb4761a0..8102d179757e 100644
--- a/nemo/collections/multimodal/data/neva/neva_dataset.py
+++ b/nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -1314,7 +1314,7 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         return batch
 
 
-def make_supervised_data_module(tokenizer, image_processor, model_cfg, data_file=None) -> Dict:
+def make_supervised_data_module(tokenizer, image_processor, model_cfg, each_file_from_path=None) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     data_cfg = model_cfg.data
     mm_cfg = model_cfg.mm_cfg
@@ -1322,7 +1322,7 @@ def make_supervised_data_module(tokenizer, image_processor, model_cfg, data_file
     if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False):
         add_extra_token = 0
     crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224))
-    data_path = data_file if data_file is not None else data_cfg.data_path
+    data_path = each_file_from_path if each_file_from_path is not None else data_cfg.data_path
     train_dataset = NevaDataset(
         tokenizer=tokenizer,
         data_path=data_path,
diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index 901a8dce8bcc..e36634d23b2c 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -1268,14 +1268,14 @@ def build_train_valid_test_datasets_blend(self):
 
         if len(data_cfg.concat_sampling_probabilities) != len(data_cfg.data_path):
             raise ValueError(
-                f"concat_sampling_probabilities must be of the same size as data_file_names. "
+                f"concat_sampling_probabilities must be of the same size as number of files from data path. "
                 f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.data_path)}"
             )
 
-        for data_file in data_cfg.data_path:
+        for each_file_from_path in data_cfg.data_path:
             if is_packed_sequence:
-                train_dataset = NevaPackedSeqDatatset(data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
-                valid_dataset = NevaPackedSeqDatatset(data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
+                train_dataset = NevaPackedSeqDatatset(each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
+                valid_dataset = NevaPackedSeqDatatset(each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
             else:
                 ds_dict = make_supervised_data_module(
                     tokenizer=self.tokenizer,
@@ -1285,7 +1285,7 @@ def build_train_valid_test_datasets_blend(self):
                         else self.model.image_processor
                     ),
                     model_cfg=self.cfg,
-                    data_file=data_file,
+                    each_file_from_path=each_file_from_path,
                 )
                 train_dataset = ds_dict["train_dataset"]
                 valid_dataset = ds_dict["eval_dataset"]
@@ -1337,8 +1337,14 @@ def build_train_valid_test_datasets(self):
                 self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len(
                     self.cfg.data.data_path
                 )
-            elif sum(self.cfg.data.concat_sampling_probabilities) != 1:
-                raise ValueError("Concat_sampling_probabilities must sum up to 1.")
+            else:
+                # Normalize the sampling probabilities if they don't sum to 1
+                total = sum(self.cfg.data.concat_sampling_probabilities)
+                if total != 1:
+                    logging.warning(f"Concat_sampling_probabilities sum to {total}. Normalizing to sum to 1.")
+                    self.cfg.data.concat_sampling_probabilities = [
+                        prob / total for prob in self.cfg.data.concat_sampling_probabilities
+                    ]
             return self.build_train_valid_test_datasets_blend()
         elif len(self.cfg.data.data_path) == 1:
             if self.cfg.data.concat_sampling_probabilities is not None:
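
Note on the last hunk: the blending setup no longer hard-fails when concat_sampling_probabilities does not sum to 1; the values are rescaled in place and a warning is logged. A minimal sketch of that rescaling, shown with made-up probabilities rather than a real NeMo config:

# Illustration only (not part of the patch); the probability values are hypothetical.
probs = [0.2, 0.2, 0.2]                 # user-supplied concat_sampling_probabilities
total = sum(probs)                      # 0.6 -- the old code would raise ValueError here
if total != 1:
    probs = [p / total for p in probs]  # rescale so the blend weights sum to 1
print(probs)                            # [0.333..., 0.333..., 0.333...]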