
Commit

address comments
Signed-off-by: Vivian Chen <[email protected]>
xuanzic committed Aug 12, 2024
1 parent 470a19f commit f1a59c4
Showing 2 changed files with 15 additions and 9 deletions.
4 changes: 2 additions & 2 deletions nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -1314,15 +1314,15 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         return batch
 
 
-def make_supervised_data_module(tokenizer, image_processor, model_cfg, data_file=None) -> Dict:
+def make_supervised_data_module(tokenizer, image_processor, model_cfg, each_file_from_path=None) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
     data_cfg = model_cfg.data
     mm_cfg = model_cfg.mm_cfg
     add_extra_token = 1
     if getattr(model_cfg, 'no_seqlen_plus_one_input_tokens', False):
         add_extra_token = 0
     crop_size = mm_cfg.vision_encoder.get("crop_size", (224, 224))
-    data_path = data_file if data_file is not None else data_cfg.data_path
+    data_path = each_file_from_path if each_file_from_path is not None else data_cfg.data_path
     train_dataset = NevaDataset(
         tokenizer=tokenizer,
         data_path=data_path,
(second changed file)
@@ -1268,14 +1268,14 @@ def build_train_valid_test_datasets_blend(self):
 
         if len(data_cfg.concat_sampling_probabilities) != len(data_cfg.data_path):
             raise ValueError(
-                f"concat_sampling_probabilities must be of the same size as data_file_names. "
+                f"concat_sampling_probabilities must be of the same size as number of files from data path. "
                 f"Provided size {len(data_cfg.concat_sampling_probabilities)}, number of datasets {len(data_cfg.data_path)}"
             )
 
-        for data_file in data_cfg.data_path:
+        for each_file_from_path in data_cfg.data_path:
             if is_packed_sequence:
-                train_dataset = NevaPackedSeqDatatset(data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
-                valid_dataset = NevaPackedSeqDatatset(data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
+                train_dataset = NevaPackedSeqDatatset(each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
+                valid_dataset = NevaPackedSeqDatatset(each_file_from_path, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
             else:
                 ds_dict = make_supervised_data_module(
                     tokenizer=self.tokenizer,
@@ -1285,7 +1285,7 @@ def build_train_valid_test_datasets_blend(self):
                         else self.model.image_processor
                     ),
                     model_cfg=self.cfg,
-                    data_file=data_file,
+                    each_file_from_path=each_file_from_path,
                 )
                 train_dataset = ds_dict["train_dataset"]
                 valid_dataset = ds_dict["eval_dataset"]
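
For orientation, a minimal sketch of the per-file blending loop after the rename from data_file to each_file_from_path. The list collection and the in-scope names tokenizer, image_processor, model_cfg, and data_cfg are hypothetical stand-ins for what the caller already has, not NeMo APIs; only make_supervised_data_module, the keyword, and the ds_dict keys come from the diff above.

# Sketch only: shape of the blend path after this change; assumes tokenizer,
# image_processor, model_cfg, and data_cfg already exist in the caller's scope.
train_datasets, valid_datasets = [], []
for each_file_from_path in data_cfg.data_path:       # one dataset file per entry
    ds_dict = make_supervised_data_module(
        tokenizer=tokenizer,
        image_processor=image_processor,
        model_cfg=model_cfg,
        each_file_from_path=each_file_from_path,      # keyword renamed from data_file
    )
    train_datasets.append(ds_dict["train_dataset"])   # hypothetical collection of per-file datasets
    valid_datasets.append(ds_dict["eval_dataset"])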
@@ -1337,8 +1337,14 @@ def build_train_valid_test_datasets(self):
                 self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len(
                     self.cfg.data.data_path
                 )
-            elif sum(self.cfg.data.concat_sampling_probabilities) != 1:
-                raise ValueError("Concat_sampling_probabilities must sum up to 1.")
+            else:
+                # Normalize the sampling probabilities if they don't sum to 1
+                total = sum(self.cfg.data.concat_sampling_probabilities)
+                if total != 1:
+                    logging.warning(f"Concat_sampling_probabilities sum to {total}. Normalizing to sum to 1.")
+                    self.cfg.data.concat_sampling_probabilities = [
+                        prob / total for prob in self.cfg.data.concat_sampling_probabilities
+                    ]
             return self.build_train_valid_test_datasets_blend()
         elif len(self.cfg.data.data_path) == 1:
             if self.cfg.data.concat_sampling_probabilities is not None:
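
The behavioral change in this last hunk is that sampling probabilities which do not sum to 1 are now rescaled instead of raising ValueError. A small standalone illustration of that normalization, using hypothetical values in plain Python outside NeMo:

# Hypothetical user-provided concat_sampling_probabilities; they sum to 1.5.
probs = [0.5, 0.7, 0.3]
total = sum(probs)
if total != 1:
    # Previously this configuration was rejected; now the weights are rescaled.
    probs = [p / total for p in probs]
print(probs)  # [0.3333..., 0.4666..., 0.2], which sums to 1.0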
