Skip to content

Commit

Permalink
fix nanotron (#283)
Browse files (browse the repository at this point in the history)
* fix nanotron

* fix nanotron

* fix nanotron

* Update examples/nanotron/lighteval_config_override_template.yaml

Co-authored-by: Hynek Kydlíček <[email protected]>

---------

Co-authored-by: Hynek Kydlíček <[email protected]>
  • Loading branch information
NathanHB and hynky1999 authored Sep 2, 2024
1 parent 24adaa2 commit cdeb6c2
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 39 deletions.
54 changes: 25 additions & 29 deletions examples/nanotron/lighteval_config_override_template.yaml
Original file line number | Diff line number | Diff line change
@@ -1,29 +1,25 @@
lighteval:
batch_size: 16
checkpoints_path: null
generation: null
logging:
hub_repo_details: null
hub_repo_results: null
hub_repo_tensorboard: HuggingFaceBR4/thomwolf-webdata-std-two
local_output_path: /scratch/thomwolf/lighteval/webdata-std-two-1p82G-wet_files_1-seed-5-698496
push_details_to_hub: false
push_results_to_hub: false
push_results_to_tensorboard: true
tensorboard_metric_prefix: e
parallelism:
dp: 1
pp: 1
pp_engine: 1f1b
recompute_granularity: null
tp: 1
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
tasks:
custom_tasks: /fsx/thomwolf/github/lighteval/tasks_examples/custom_tasks/custom_evaluation_tasks.py
dataset_loading_processes: 8
max_samples: 10
multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: early-signal
# tasks: custom|hellaswag|0
batch_size: 16
checkpoints_path: null
generation: null
logging:
hub_repo_details: null
hub_repo_results: null
hub_repo_tensorboard: null
local_output_path: ./output_dir
push_details_to_hub: false
push_results_to_hub: false
push_results_to_tensorboard: true
tensorboard_metric_prefix: e
parallelism:
dp: 1
pp: 1
pp_engine: 1f1b
tp: 1
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
tasks:
dataset_loading_processes: 8
max_samples: 10
multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: lighteval|gsm8k|5|1
6 changes: 4 additions & 2 deletions src/lighteval/models/nanotron_model.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -310,7 +310,7 @@ def device(self) -> Union[int, str, torch.device]:
return "cuda"

def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int:
if override_bs > 0:
if override_bs:
return override_bs
logger.warning("Detecting largest batch size")

Expand Down Expand Up @@ -1155,7 +1155,9 @@ def greedy_until(
max_input_length = min(len(context_enc) + max_gen, self.max_length)

batch_size = self._get_batch_size(
override_bs=override_bs, max_input_length=max_input_length, starting_batch_size=starting_batch_size
override_bs=self._batch_size,
max_input_length=max_input_length,
starting_batch_size=starting_batch_size,
)
# For next iteration, since the batch will be smaller, we'll test a bigger batch size
starting_batch_size = batch_size * 2
Expand Down
15 changes: 7 additions & 8 deletions src/lighteval/pipeline.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -103,8 +103,8 @@ def __init__(
tasks: str,
pipeline_parameters: PipelineParameters,
evaluation_tracker: EvaluationTracker,
model=None,
model_config=None,
model=None,
):
if not (model or model_config):
raise ValueError("Must provide either a model or model config when creating a pipeline.")
Expand All @@ -116,10 +116,9 @@ def __init__(
"WARNING: --max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING."
)

self.accelerator, self.parallel_context = self._init_parallelism_manager()

self.evaluation_tracker = evaluation_tracker
self.model_config = model_config
self.evaluation_tracker = evaluation_tracker
self.accelerator, self.parallel_context = self._init_parallelism_manager()
self.model = self._init_model(model_config, model)

self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info)
Expand All @@ -141,9 +140,9 @@ def _init_parallelism_manager(self):
raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed")
dist.initialize_torch_distributed()
parallel_context = ParallelContext(
tensor_parallel_size=self.model_config.parallelism.tp,
pipeline_parallel_size=self.model_config.parallelism.pp,
data_parallel_size=self.model_config.parallelism.dp,
tensor_parallel_size=self.model_config.lighteval.parallelism.tp,
pipeline_parallel_size=self.model_config.lighteval.parallelism.pp,
data_parallel_size=self.model_config.lighteval.parallelism.dp,
)
test_all_gather(parallel_context=parallel_context)

Expand All @@ -156,7 +155,7 @@ def _init_model(self, model_config, model):
return NanotronLightevalModel(
checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path),
nanotron_config=self.model_config,
parallel_context=self.accelerator,
parallel_context=self.parallel_context,
debug_one_layer_model=False,
model_class=None,
env_config=self.pipeline_parameters.env_config,
Expand Down

0 comments on commit cdeb6c2

Please sign in to comment.