╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /home/dev/project/chatglm2_finetuning/train.py:118 in │
│ │
│ 115 │ ) │
│ 116 │ │
│ 117 │ if train_datasets is not None: │
│ ❱ 118 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 119 │
│ 120 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/trainer.py:520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/call.py:42 in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/launchers/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/trainer.py:559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /trainer/trainer.py:911 in _run │
│ │
│ 908 │ │ self._logger_connector.reset_metrics() │
│ 909 │ │ │
│ 910 │ │ # strategy will configure model and move it to the device │
│ ❱ 911 │ │ self.strategy.setup(self) │
│ 912 │ │ │
│ 913 │ │ # hook │
│ 914 │ │ if self.state.fn == TrainerFn.FITTING: │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:344 in setup │
│ │
│ 341 │ │ self.setup_optimizers(trainer) │
│ 342 │ │ self.setup_precision_plugin() │
│ 343 │ │ _optimizers_to_device(self.optimizers, self.root_device) │
│ ❱ 344 │ │ self.init_deepspeed() │
│ 345 │ │ self.barrier() │
│ 346 │ │
│ 347 │ def _init_deepspeed_distributed(self) -> None: │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:448 in init_deepspeed │
│ │
│ 445 │ │ model = _LightningModuleWrapperBase(forward_module=self.model) │
│ 446 │ │ │
│ 447 │ │ if self.lightning_module.trainer and self.lightning_module.tra │
│ ❱ 448 │ │ │ self._initialize_deepspeed_train(model) │
│ 449 │ │ else: │
│ 450 │ │ │ self._initialize_deepspeed_inference(model) │
│ 451 │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:484 in _initialize_deepspeed_train │
│ │
│ 481 │ │ │ if lr_scheduler is not None: │
│ 482 │ │ │ │ scheduler = lr_scheduler.scheduler │
│ 483 │ │ │
│ ❱ 484 │ │ model, deepspeed_optimizer = self._setup_model_and_optimizer(m │
│ 485 │ │ self._set_deepspeed_activation_checkpointing() │
│ 486 │ │ │
│ 487 │ │ # although we set these here, deepspeed manages the specific o │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/lightning/pytorch │
│ /strategies/deepspeed.py:413 in _setup_model_and_optimizer │
│ │
│ 410 │ │ import deepspeed │
│ 411 │ │ │
│ 412 │ │ model_parameters = filter(lambda p: p.requires_grad, model.par │
│ ❱ 413 │ │ deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initia │
│ 414 │ │ │ args=argparse.Namespace(device_rank=self.root_device.index │
│ 415 │ │ │ config=self.config, │
│ 416 │ │ │ model=model, │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/ │
│ __init__.py:165 in initialize │
│ │
│ 162 │ │ │ │ │ │ │ │ │ │ config=config, │
│ 163 │ │ │ │ │ │ │ │ │ │ config_class=config_class) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ engine = DeepSpeedEngine(args=args, │
│ 166 │ │ │ │ │ │ │ │ │ model=model, │
│ 167 │ │ │ │ │ │ │ │ │ optimizer=optimizer, │
│ 168 │ │ │ │ │ │ │ │ │ model_parameters=model_parameters │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /engine.py:309 in __init__ │
│ │
│ 306 │ │ │ model_parameters = list(model_parameters) │
│ 307 │ │ │
│ 308 │ │ if has_optimizer: │
│ ❱ 309 │ │ │ self._configure_optimizer(optimizer, model_parameters) │
│ 310 │ │ │ self._configure_lr_scheduler(lr_scheduler) │
│ 311 │ │ │ self._report_progress(0) │
│ 312 │ │ elif self.zero_optimization(): │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /engine.py:1184 in _configure_optimizer │
│ │
│ 1181 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │
│ 1182 │ │ │
│ 1183 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │
│ ❱ 1184 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │
│ 1185 │ │ elif optimizer_wrapper == AMP: │
│ 1186 │ │ │ amp_params = self.amp_params() │
│ 1187 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /engine.py:1419 in _configure_zero_optimizer │
│ │
│ 1416 │ │ │ │ if overlap_comm: │
│ 1417 │ │ │ │ │ logger.warning("Pipeline parallelism does not sup │
│ 1418 │ │ │ │ │ overlap_comm = False │
│ ❱ 1419 │ │ │ optimizer = DeepSpeedZeroOptimizer( │
│ 1420 │ │ │ │ optimizer, │
│ 1421 │ │ │ │ self.param_names, │
│ 1422 │ │ │ │ timers=timers, │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /zero/stage_1_and_2.py:312 in __init__ │
│ │
│ 309 │ │ │ │
│ 310 │ │ │ # create flat buffer in CPU and move to GPU │
│ 311 │ │ │ self.bit16_groups_flat.append( │
│ ❱ 312 │ │ │ │ self.flatten_dense_tensors_aligned( │
│ 313 │ │ │ │ │ self.round_robin_bit16_groups[i], │
│ 314 │ │ │ │ │ self.nccl_start_alignment_factor * dist.get_world │
│ 315 │ │ │ │ │ │ get_accelerator().current_device_name())) │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/deepspeed/runtime │
│ /zero/stage_1_and_2.py:829 in flatten_dense_tensors_aligned │
│ │
│ 826 │ │
│ 827 │ # create a flat tensor aligned at the alignment boundary │
│ 828 │ def flatten_dense_tensors_aligned(self, tensor_list, alignment): │
│ ❱ 829 │ │ return self.flatten(align_dense_tensors(tensor_list, alignmen │
│ 830 │ │
│ 831 │ ############### Independent Partition Gradient ################## │
│ 832 │ def reduce_independent_p_g_buckets_and_remove_grads(self, param, │
│ │
│ /home/dev/anaconda3/envs/glm2/lib/python3.10/site-packages/torch/_utils.py:4 │
│ 59 in _flatten_dense_tensors │
│ │
│ 456 │ Returns: │
│ 457 │ │ A contiguous 1D buffer containing input tensors. │
│ 458 │ """ │
│ ❱ 459 │ return torch._C._nn.flatten_dense_tensors(tensors) │
│ 460 │
│ 461 │
│ 462 def _flatten_sparse_tensors(tensors): │
╰──────────────────────────────────────────────────────────────────────────────
chatglm2_finetuning/config/main.py, line 70 (commit 74be888):
ptv2 has been changed to use deepspeed_offload.json.
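A minimal sketch of what a ZeRO stage-2, optimizer-offload setup along these lines can look like with Lightning's DeepSpeed strategy is shown below. The config values, Trainer arguments, and variable names are illustrative assumptions, not the repo's actual deepspeed_offload.json. If the crash inside flatten_dense_tensors is a GPU out-of-memory during optimizer setup, moving optimizer state to CPU is a common mitigation.

```python
# Sketch only: an illustrative ZeRO stage-2 config with optimizer CPU offload,
# similar in spirit to a deepspeed_offload.json. Values are assumptions.
from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DeepSpeedStrategy

deepspeed_offload_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_allow_untested_optimizer": True,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,
        # keep optimizer state in pinned CPU memory instead of GPU memory
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "contiguous_gradients": True,
        "overlap_comm": True,
    },
}

# The dict (or a path to the JSON file) is handed to Lightning's DeepSpeed strategy.
trainer = Trainer(
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    strategy=DeepSpeedStrategy(config=deepspeed_offload_config),
)
# trainer.fit(pl_model, train_dataloaders=train_datasets)  # as in train.py
```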