if global_args['load_in_4bit']:
    global_args['quantization_config'] = None

# check whether lora / adalora is enabled
if 'lora' not in train_info_args and 'adalora' not in train_info_args:
    raise ValueError('please config lora or adalora')
if train_info_args.get('lora', {}).get('with_lora', False) and train_info_args.get('adalora', {}).get('with_lora', False):
    raise Exception('lora and adalora cannot both be enabled at the same time!')

train_info_args.pop('prompt', None)
The training command is as follows:
CUDA_VISIBLE_DEVICES=0,1 python train.py
The error message is as follows:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /chatglm2-dev/train.py:122 in │
│ │
│ 119 │ ) │
│ 120 │ │
│ 121 │ if train_datasets is not None: │
│ ❱ 122 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 123 │
│ 124 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:42 │
│ in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/launcher │
│ s/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 935 in _run │
│ │
│ 932 │ │ # ---------------------------- │
│ 933 │ │ # RUN THE TRAINER │
│ 934 │ │ # ---------------------------- │
│ ❱ 935 │ │ results = self._run_stage() │
│ 936 │ │ │
│ 937 │ │ # ---------------------------- │
│ 938 │ │ # POST-Training CLEAN UP │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 978 in _run_stage │
│ │
│ 975 │ │ │ with isolate_rng(): │
│ 976 │ │ │ │ self._run_sanity_check() │
│ 977 │ │ │ with torch.autograd.set_detect_anomaly(self._detect_anoma │
│ ❱ 978 │ │ │ │ self.fit_loop.run() │
│ 979 │ │ │ return None │
│ 980 │ │ raise RuntimeError(f"Unexpected state {self.state}") │
│ 981 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:2 │
│ 01 in run │
│ │
│ 198 │ │ while not self.done: │
│ 199 │ │ │ try: │
│ 200 │ │ │ │ self.on_advance_start() │
│ ❱ 201 │ │ │ │ self.advance() │
│ 202 │ │ │ │ self.on_advance_end() │
│ 203 │ │ │ │ self._restarting = False │
│ 204 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:3 │
│ 54 in advance │
│ │
│ 351 │ │ assert self._data_fetcher is not None │
│ 352 │ │ self._data_fetcher.setup(combined_loader) │
│ 353 │ │ with self.trainer.profiler.profile("run_training_epoch"): │
│ ❱ 354 │ │ │ self.epoch_loop.run(self._data_fetcher) │
│ 355 │ │
│ 356 │ def on_advance_end(self) -> None: │
│ 357 │ │ trainer = self.trainer │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:133 in run │
│ │
│ 130 │ │ self.on_run_start(data_fetcher) │
│ 131 │ │ while not self.done: │
│ 132 │ │ │ try: │
│ ❱ 133 │ │ │ │ self.advance(data_fetcher) │
│ 134 │ │ │ │ self.on_advance_end() │
│ 135 │ │ │ │ self._restarting = False │
│ 136 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:218 in advance │
│ │
│ 215 │ │ │ with trainer.profiler.profile("run_training_batch"): │
│ 216 │ │ │ │ if trainer.lightning_module.automatic_optimization: │
│ 217 │ │ │ │ │ # in automatic optimization, there can only be one │
│ ❱ 218 │ │ │ │ │ batch_output = self.automatic_optimization.run(tra │
│ 219 │ │ │ │ else: │
│ 220 │ │ │ │ │ batch_output = self.manual_optimization.run(kwargs │
│ 221 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:185 in run │
│ │
│ 182 │ │ # ------------------------------ │
│ 183 │ │ # gradient update with accumulated gradients │
│ 184 │ │ else: │
│ ❱ 185 │ │ │ self._optimizer_step(kwargs.get("batch_idx", 0), closure) │
│ 186 │ │ │
│ 187 │ │ result = closure.consume_result() │
│ 188 │ │ if result.loss is None: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:261 in _optimizer_step │
│ │
│ 258 │ │ │ self.optim_progress.optimizer.step.increment_ready() │
│ 259 │ │ │
│ 260 │ │ # model hook │
│ ❱ 261 │ │ call._call_lightning_module_hook( │
│ 262 │ │ │ trainer, │
│ 263 │ │ │ "optimizer_step", │
│ 264 │ │ │ trainer.current_epoch, │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:142 │
│ in _call_lightning_module_hook │
│ │
│ 139 │ pl_module._current_fx_name = hook_name │
│ 140 │ │
│ 141 │ with trainer.profiler.profile(f"[LightningModule]{pl_module.__clas │
│ ❱ 142 │ │ output = fn(*args, **kwargs) │
│ 143 │ │
│ 144 │ # restore current_fx when nested context │
│ 145 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/module.py:1265 │
│ in optimizer_step │
│ │
│ 1262 │ │ │ │ │ for pg in optimizer.param_groups: │
│ 1263 │ │ │ │ │ │ pg["lr"] = lr_scale * self.learning_rate │
│ 1264 │ │ """ │
│ ❱ 1265 │ │ optimizer.step(closure=optimizer_closure) │
│ 1266 │ │
│ 1267 │ def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimiz │
│ 1268 │ │ """Override this method to change the default behaviour of
│ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/optimizer.py:1 │ │ 58 in step │ │ │ │ 155 │ │ │ raise MisconfigurationException("When `optimizer.step(clos │ │ 156 │ │ │ │ 157 │ │ assert self._strategy is not None │ │ ❱ 158 │ │ step_output = self._strategy.optimizer_step(self._optimizer, c │ │ 159 │ │ │ │ 160 │ │ self._on_after_step() │ │ 161 │ │ │ │ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:2 │ │ 59 in optimizer_step │ │ │ │ 256 │ │ │ model: reference to the model, optionally defining optimiz │ │ 257 │ │ │ **kwargs: Any extra arguments to
optimizer.step`` ││ 258 │ │ """ │
│ ❱ 259 │ │ optimizer_output = super().optimizer_step(optimizer, closure, │
│ 260 │ │ │
│ 261 │ │ if self._model_averager is None: │
│ 262 │ │ │ return optimizer_output │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/strategy │
│ .py:224 in optimizer_step │
│ │
│ 221 │ │ model = model or self.lightning_module │
│ 222 │ │ # TODO(fabric): remove assertion once strategy's optimizer_ste │
│ 223 │ │ assert isinstance(model, pl.LightningModule) │
│ ❱ 224 │ │ return self.precision_plugin.optimizer_step(optimizer, model=m │
│ 225 │ │
│ 226 │ def _setup_model_and_optimizers(self, model: Module, optimizers: L │
│ 227 │ │ """Setup a model and multiple optimizers together. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/plugins/precision/d │
│ eepspeed.py:92 in optimizer_step │
│ │
│ 89 │ ) -> Any: │
│ 90 │ │ if isinstance(optimizer, LBFGS): │
│ 91 │ │ │ raise MisconfigurationException("DeepSpeed and the LBFGS o │
│ ❱ 92 │ │ closure_result = closure() │
│ 93 │ │ self._after_closure(model, optimizer) │
│ 94 │ │ skipped_backward = closure_result is None │
│ 95 │ │ # in manual optimization, the closure does not return a value │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:140 in call │
│ │
│ 137 │ │ return step_output │
│ 138 │ │
│ 139 │ def call(self, *args: Any, **kwargs: Any) -> Optional[Tensor]: │
│ ❱ 140 │ │ self._result = self.closure(*args, **kwargs) │
│ 141 │ │ return self._result.loss │
│ 142 │
│ 143 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:126 in closure │
│ │
│ 123 │ │ self._zero_grad_fn = zero_grad_fn │
│ 124 │ │
│ 125 │ def closure(self, *args: Any, **kwargs: Any) -> ClosureResult: │
│ ❱ 126 │ │ step_output = self._step_fn() │
│ 127 │ │ │
│ 128 │ │ if step_output.closure_loss is None: │
│ 129 │ │ │ self.warning_cache.warn("`training_step` returned `None`. │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:308 in _training_step │
│ │
│ 305 │ │ trainer = self.trainer │
│ 306 │ │ │
│ 307 │ │ # manually capture logged metrics │
│ ❱ 308 │ │ training_step_output = call._call_strategy_hook(trainer, "trai │
│ 309 │ │ self.trainer.strategy.post_training_step() │
│ 310 │ │ │
│ 311 │ │ result = self.output_result_cls.from_training_step_output(trai │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:288 │
│ in _call_strategy_hook │
│ │
│ 285 │ │ return │
│ 286 │ │
│ 287 │ with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__clas │
│ ❱ 288 │ │ output = fn(*args, **kwargs) │
│ 289 │ │
│ 290 │ # restore current_fx when nested context │
│ 291 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:3 │
│ 31 in training_step │
│ │
│ 328 │ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: │
│ 329 │ │ assert self.model is not None │
│ 330 │ │ with self.precision_plugin.train_step_context(): │
│ ❱ 331 │ │ │ return self.model(*args, **kwargs) │
│ 332 │ │
│ 333 │ def validation_step(self, *args: Any, **kwargs: Any) -> Optional[S │
│ 334 │ │ with self.precision_plugin.val_step_context(): │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/utils/nvtx.py:15 in │
│ wrapped_fn │
│ │
│ 12 │ │
│ 13 │ def wrapped_fn(*args, **kwargs): │
│ 14 │ │ get_accelerator().range_push(func.__qualname__) │
│ ❱ 15 │ │ ret_val = func(*args, **kwargs) │
│ 16 │ │ get_accelerator().range_pop() │
│ 17 │ │ return ret_val │
│ 18 │
│ │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py:1769 in │
│ forward │
│ │
│ 1766 │ │ if self.fp16_auto_cast(): │
│ 1767 │ │ │ inputs = self._cast_inputs_half(inputs) │
│ 1768 │ │ │
│ ❱ 1769 │ │ loss = self.module(*inputs, **kwargs) │
│ 1770 │ │ │
│ 1771 │ │ if self.zero_optimization_partition_weights(): │
│ 1772 │ │ │ # Disable automated discovery of external parameters │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/overrides/base.py:9 │
│ 0 in forward │
│ │
│ 87 │ │ │
│ 88 │ │ if trainer is not None: │
│ 89 │ │ │ if trainer.training: │
│ ❱ 90 │ │ │ │ output = self.forward_module.training_step(*inputs, * │
│ 91 │ │ │ │ # In manual_optimization, we need to prevent DDP reduc │
│ 92 │ │ │ │ # it is done manually in `LightningModule.manual_backw │
│ 93 │ │ │ │ # `require_backward_grad_sync` will be reset in the │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer │
│ base.py:552 in training_step │
│ │
│ 549 │ │
│ 550 │ def training_step(self, batch): │
│ 551 │ │ if isinstance(batch, dict): │
│ ❱ 552 │ │ │ outputs = self.compute_loss(**batch) │
│ 553 │ │ else: │
│ 554 │ │ │ outputs = self.compute_loss(**dict(batch)) │
│ 555 │ │ loss = outputs[0] │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer │
│ base.py:371 in compute_loss │
│ │
│ 368 │ def compute_loss(self, *args, **kwargs): │
│ 369 │ │ if len(args): │
│ 370 │ │ │ kwargs.update(dict(args)) │
│ ❱ 371 │ │ return self.model.compute_loss(**kwargs) │
│ 372 │ │
│ 373 │ def forward(self, *args, **kwargs): │
│ 374 │ │ if len(args): │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer │
│ base.py:117 in compute_loss │
│ │
│ 114 │ │ return self.model(*args, **batch) │
│ 115 │ │
│ 116 │ def compute_loss(self, *args, **batch) -> tuple: │
│ ❱ 117 │ │ return self.model(*args, **batch) │
│ 118 │ │
│ 119 │ def post_init(self): │
│ 120 │ │ return self.model.post_init() │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:953 in forward │
│ │
│ 950 │ │ use_cache = use_cache if use_cache is not None else self.conf │
│ 951 │ │ return_dict = return_dict if return_dict is not None else sel │
│ 952 │ │ │
│ ❱ 953 │ │ transformer_outputs = self.transformer( │
│ 954 │ │ │ input_ids=input_ids, │
│ 955 │ │ │ position_ids=position_ids, │
│ 956 │ │ │ attention_mask=attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:849 in forward │
│ │
│ 846 │ │ rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() │
│ 847 │ │ │
│ 848 │ │ # Run encoder. │
│ ❱ 849 │ │ hidden_states, presents, all_hidden_states, all_self_attentio │
│ 850 │ │ │ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary │
│ 851 │ │ │ kv_caches=past_key_values, use_cache=use_cache, output_hi │
│ 852 │ │ ) │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:639 in forward │
│ │
│ 636 │ │ │ │
│ 637 │ │ │ layer = self._get_layer(index) │
│ 638 │ │ │ if self.gradient_checkpointing and self.training: │
│ ❱ 639 │ │ │ │ layer_ret = torch.utils.checkpoint.checkpoint( │
│ 640 │ │ │ │ │ layer, │
│ 641 │ │ │ │ │ hidden_states, │
│ 642 │ │ │ │ │ attention_mask, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:235 in │
│ checkpoint │
│ │
│ 232 │ │ raise ValueError("Unexpected keyword arguments: " + ",".join(a │
│ 233 │ │
│ 234 │ if use_reentrant: │
│ ❱ 235 │ │ return CheckpointFunction.apply(function, preserve, *args) │
│ 236 │ else: │
│ 237 │ │ return _checkpoint_without_reentrant( │
│ 238 │ │ │ function, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:96 in │
│ forward │
│ │
│ 93 │ │ ctx.save_for_backward(*tensor_inputs) │
│ 94 │ │ │
│ 95 │ │ with torch.no_grad(): │
│ ❱ 96 │ │ │ outputs = run_function(*args) │
│ 97 │ │ return outputs │
│ 98 │ │
│ 99 │ @staticmethod │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:551 in forward │
│ │
│ 548 │ │ # hidden_states: [s, b, h] │
│ 549 │ │ │
│ 550 │ │ # Layer norm at the beginning of the transformer layer. │
│ ❱ 551 │ │ layernorm_output = self.input_layernorm(hidden_states) │
│ 552 │ │ # Self attention. │
│ 553 │ │ attention_output, kv_cache = self.self_attention( │
│ 554 │ │ │ layernorm_output, │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in │
│ _call_impl │
│ │
│ 1127 │ │ # this function, and just call forward. │
│ 1128 │ │ if not (self._backward_hooks or self._forward_hooks or self._ │
│ 1129 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
│ ❱ 1130 │ │ │ return forward_call(*input, **kwargs) │
│ 1131 │ │ # Do not call functions when jit is used │
│ 1132 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1133 │ │ if self._backward_hooks or _global_backward_hooks: │
│ │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in │
│ new_forward │
│ │
│ 162 │ │ │ with torch.no_grad(): │
│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │
│ 166 │ │ return module._hf_hook.post_forward(module, output) │
│ 167 │ │
│ 168 │ module.forward = new_forward │
│ │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/mod │
│ eling_chatglm.py:201 in forward │
│ │
│ 198 │ │ variance = hidden_states.to(torch.float32).pow(2).mean(-1, ke │
│ 199 │ │ hidden_states = hidden_states * torch.rsqrt(variance + self.e │
│ 200 │ │ │
│ ❱ 201 │ │ return (self.weight * hidden_states).to(input_dtype) │
│ 202 │
│ 203 │
│ 204 class CoreAttention(torch.nn.Module): │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least
two devices, cuda:0 and cuda:1!
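The traceback ends inside an accelerate forward hook at the RMSNorm multiply (self.weight * hidden_states), so the layer weight and its input apparently ended up on different GPUs. As a sanity check, the same RuntimeError can be reproduced in isolation with two toy tensors (a hypothetical standalone snippet, not from the repo; it assumes at least two visible GPUs):

import torch

# putting the 'weight' on cuda:0 and the activations on cuda:1 triggers the
# same error the traceback above shows for the RMSNorm multiply
w = torch.ones(8, device='cuda:0')
x = torch.randn(4, 8, device='cuda:1')
y = w * x  # RuntimeError: Expected all tensors to be on the same device ...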
The train_info_args in sft_config.py is as follows:
train_info_args = {
    'devices': 2,
    'data_backend': 'parquet',  # one of record, lmdb, arrow_stream, arrow_file, parquet; lmdb can be used for very large datasets (note: lmdb takes more storage space than record)
    # pretrained model path
    **train_model_config,
    'convert_onnx': False,  # convert the model to ONNX
    'do_train': True,
    'train_file': ['/chatglm2-dev/data/finetune_train_examples.json'],
    'max_epochs': 20,
    'max_steps': -1,
    'optimizer': 'lion',  # one of [lamb,adma,adamw_hf,adamw,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit]
}
The relevant part of main.py is as follows:

# module configuration; LoRA is enabled by default
enable_deepspeed = True
enable_ptv2 = False
enable_lora = True
load_in_bit = 0  # 4 = load_in_4bit, 8 = load_in_8bit, anything else = 0

if enable_lora:
    from config.sft_config_lora import *
elif enable_ptv2:
    from config.sft_config_ptv2 import *
else:
    from config.sft_config import *

if enable_lora:
    enable_ptv2 = False
    global_args['load_in_4bit'] = load_in_bit == 4
    global_args['load_in_8bit'] = load_in_bit == 8
elif enable_ptv2:
    enable_lora = False
    global_args['load_in_4bit'] = False
    global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
else:
    enable_ptv2 = False
    enable_lora = False
    # global_args['load_in_4bit'] = False
    # global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
    train_info_args.pop('prompt', None)
# preprocessing
if 'rwkv' in train_info_args['tokenizer_name'].lower():
    train_info_args['use_fast_tokenizer'] = True

def get_deepspeed_config():
    '''
    LoRA / prompt finetuning uses deepspeed_offload.json
    plain finetuning uses deepspeed.json
    '''
    # whether DeepSpeed is enabled
    if not enable_deepspeed:
        return None
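For reference, here is a minimal sketch of how get_deepspeed_config could continue based on its docstring. The continuation is hypothetical (the config file locations are assumptions and the actual repo code may differ); it only illustrates the selection the docstring describes:

import json
import os

def get_deepspeed_config():
    # relies on the enable_deepspeed / enable_lora / enable_ptv2 flags defined above
    if not enable_deepspeed:
        return None
    # hypothetical continuation: LoRA / P-Tuning v2 use the offload config,
    # plain finetuning uses the standard config (names taken from the docstring),
    # both assumed to sit next to this script
    filename = 'deepspeed_offload.json' if (enable_lora or enable_ptv2) else 'deepspeed.json'
    with open(os.path.join(os.path.dirname(__file__), filename), 'r', encoding='utf-8') as f:
        return json.load(f)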