Skip to content

Commit

Permalink
Merge pull request #65 from ssbuild/dev
Browse files Browse the repository at this point in the history
fix hf and acc bug
  • Loading branch information
ssbuild authored Oct 9, 2023
2 parents 6cfa8b0 + a9097ef commit 1073340
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 19 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pip install -U git+https://github.com/ssbuild/deep_training.git --no-deps --forc
- <strong>2023-10-07</strong>
- 0.2.5 support colossalai 训练, 策略 ddp, gemini, gemini_auto, zero2, zero2_cpu, 3d
- 0.2.5.post0 fix model deepcopy
- 0.2.5.post1 support accelerator 训练
- 0.2.5.post2 support accelerator 训练, fix some bugs in accelerator and hf trainer

- <strong>2023-09-26</strong>
- 0.2.4 support transformers trainer and qwen-7b 新版 和 qwen-14b , 旧版不再支持,旧版可以安装 deep_training <= 0.2.3
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
]
setup(
name='deep_training',
version='0.2.5.post1',
version='0.2.5.post2',
description='an easy training architecture',
long_description='torch_training: https://github.com/ssbuild/deep_training.git',
license='Apache License 2.0',
Expand Down
2 changes: 2 additions & 0 deletions src/deep_training/nlp/optimizer/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ def get_optimizer_cls_and_kwargs(optimizer_name,args) -> typing.Tuple[typing.Any
raise ValueError("Please install https://github.com/pytorch/torchdistx")
elif optimizer_name == OptimizerNames.SGD:
optimizer_cls = torch.optim.SGD
elif optimizer_name == OptimizerNames.RMSPROP:
optimizer_cls = torch.optim.RMSprop
elif optimizer_name == OptimizerNames.ADAGRAD:
optimizer_cls = torch.optim.Adagrad
else:
Expand Down
29 changes: 14 additions & 15 deletions src/deep_training/trainer/ac/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from pathlib import Path
from typing import Union, Optional, Callable, List, Tuple, Dict, Any
import numpy as np
import accelerate
from accelerate import Accelerator, DistributedType
from accelerate.checkpointing import save_accelerator_state, save_custom_state
from accelerate.utils import GradientAccumulationPlugin, is_deepspeed_available
Expand Down Expand Up @@ -571,8 +570,18 @@ def train(self,start_epoch=0,start_step=0, trial: Union["optuna.Trial", Dict[str
def training_step(self, model: nn.Module, inputs: Dict[ str, Union[ torch.Tensor, Any ] ]) -> torch.Tensor:
device = torch.cuda.current_device()
batch = {k: v.to(device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
loss = model(**batch)
return loss
loss_obj = model(**batch)

if isinstance(loss_obj, (list, tuple)):
loss_obj = loss_obj[ 0 ]

if isinstance(loss_obj, dict):
tr_loss_step = loss_obj[ "loss" ]
else:
tr_loss_step = loss_obj

self.accelerator.backward(loss=tr_loss_step)
return tr_loss_step.detach() / self.args.gradient_accumulation_steps

def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
"""
Expand Down Expand Up @@ -729,6 +738,7 @@ def _train_loop(self,start_epoch=0,start_step=0,
)
self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
step = -1
model.train()
with tqdm(
iterable=enumerate(train_dataloader, start=start_step),
desc=f"Epoch {epoch}",
Expand All @@ -741,18 +751,7 @@ def _train_loop(self,start_epoch=0,start_step=0,
if step % args.gradient_accumulation_steps == 0:
self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
with self.accelerator.accumulate(model):
loss_obj = self.training_step(model, batch)
if isinstance(loss_obj, (list, tuple)):
loss_obj = loss_obj[0]

if isinstance(loss_obj, dict):
tr_loss_step = loss_obj["loss"]
else:
tr_loss_step = loss_obj

self.accelerator.backward(loss=tr_loss_step)

tr_loss_step = tr_loss_step / args.gradient_accumulation_steps
tr_loss_step = self.training_step(model, batch)

if (
args.logging_nan_inf_filter
Expand Down
5 changes: 4 additions & 1 deletion src/deep_training/trainer/cl/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from typing import Union, Optional, Callable, List, Tuple, Dict, Any

import numpy as np
from lightning_utilities import apply_to_collection
from packaging import version
from datasets import Dataset
from peft import PeftModel
Expand Down Expand Up @@ -756,9 +757,11 @@ def _train_loop(self,start_epoch=0,start_step=0,

booster.backward(loss=loss, optimizer=optimizer)

all_reduce_mean(tensor=loss)

loss_obj = apply_to_collection(loss_obj, dtype=torch.Tensor, function=lambda x: x.detach())
loss = loss.detach()

all_reduce_mean(tensor=loss)
pbar.set_postfix({"Loss": f"{loss.item():.4f}"})
if coordinator.is_master():
global_step = epoch * num_steps_per_epoch + step
Expand Down
2 changes: 1 addition & 1 deletion src/deep_training/trainer/hf/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop("labels")
else:
labels = None
outputs = model.compute_loss(**inputs)
outputs = model(**inputs)
# Save past state if it exists
# TODO: this needs to be fixed and made cleaner later.
if self.args.past_index >= 0:
Expand Down

0 comments on commit 1073340

Please sign in to comment.