Commit 0ec4ec2

Merge branch 'main' into chatbot-universal-guide

ganochenkodg authored Jun 12, 2024
2 parents 493f55a + 5254bf5 commit 0ec4ec2
Showing 43 changed files with 673 additions and 360 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/ai-ml-gke-finetuning-gemma.yml
@@ -0,0 +1,21 @@
name: ai-ml-finetune-gemma
on:
  push:
    branches:
      - main
    paths:
      - '.github/workflows/ai-ml-gke-finetuning-gemma.yml'
      - 'ai-ml/llm-finetuning-gemma/**'
  pull_request:
    paths:
      - '.github/workflows/ai-ml-gke-finetuning-gemma.yml'
      - 'ai-ml/llm-finetuning-gemma/**'
jobs:
  gke-a100-jax:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - name: build container for llm-finetuning-gemma tutorial
        run: |
          cd ai-ml/llm-finetuning-gemma/
          docker build --tag finetune .
2 changes: 1 addition & 1 deletion ai-ml/gke-ray/gke-platform/modules/kuberay/kuberay.tf
@@ -23,6 +23,6 @@ resource "helm_release" "kuberay-operator" {
   repository = "https://ray-project.github.io/kuberay-helm/"
   chart      = "kuberay-operator"
   values     = var.enable_autopilot ? [file("${path.module}/kuberay-operator-autopilot-values.yaml")] : [file("${path.module}/kuberay-operator-values.yaml")]
-  version    = "1.1.0"
+  version    = "1.1.1"
   namespace  = "${kubernetes_namespace.ray_namespace.metadata[0].name}"
 }
2 changes: 1 addition & 1 deletion ai-ml/gke-ray/gke-platform/modules/kuberay/versions.tf
@@ -20,7 +20,7 @@ terraform {
     }
     kubernetes = {
       source  = "hashicorp/kubernetes"
-      version = "2.29.0"
+      version = "2.30.0"
     }
   }
   provider_meta "google" {
2 changes: 1 addition & 1 deletion ai-ml/gke-ray/gke-platform/modules/kubernetes/versions.tf
@@ -20,7 +20,7 @@ terraform {
     }
     kubernetes = {
       source  = "hashicorp/kubernetes"
-      version = "2.29.0"
+      version = "2.30.0"
     }
     kubectl = {
       source  = "alekc/kubectl"
2 changes: 1 addition & 1 deletion ai-ml/gke-ray/gke-platform/versions.tf
@@ -27,7 +27,7 @@ terraform {
     }
     kubernetes = {
       source  = "hashicorp/kubernetes"
-      version = "2.29.0"
+      version = "2.30.0"
     }
     kubectl = {
       source  = "alekc/kubectl"
32 changes: 32 additions & 0 deletions ai-ml/llm-finetuning-gemma/Dockerfile
@@ -0,0 +1,32 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_aiml_llm_finetune_gemma_single_node_docker]
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

RUN apt-get update && \
    apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \
    rm -rf /var/lib/apt/lists/*

RUN pip3 install --no-cache-dir \
    accelerate==0.30.1 bitsandbytes==0.43.1 \
    datasets==2.19.1 transformers==4.41.0 \
    peft==0.11.1 trl==0.8.6 torch==2.3.0

COPY finetune.py /finetune.py

ENV PYTHONUNBUFFERED 1

CMD python3 /finetune.py --device cuda
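
# A hypothetical local smoke test, not part of this file (assumes the NVIDIA
# Container Toolkit and a Hugging Face token with access to the gated Gemma
# weights; huggingface_hub reads the HF_TOKEN environment variable):
#   docker build --tag finetune .
#   docker run --gpus all -e HF_TOKEN=<your token> finetune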
# [END gke_aiml_llm_finetune_gemma_single_node_docker]
3 changes: 3 additions & 0 deletions ai-ml/llm-finetuning-gemma/README.md
@@ -0,0 +1,3 @@
# Samples: finetune an LLM with multiple GPUs on GKE

TBD
251 changes: 251 additions & 0 deletions ai-ml/llm-finetuning-gemma/finetune.py
@@ -0,0 +1,251 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_aiml_llm_finetune_gemma_single_node_py]
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

from trl import SFTTrainer

# The base model from the Hugging Face Hub to fine-tune
model_name = os.getenv("MODEL_NAME", "google/gemma-2b")

# The instruction dataset to use
dataset_name = "b-mc2/sql-create-context"

# Fine-tuned model name
new_model = os.getenv("NEW_MODEL", "gemma-2b-sql")

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = int(os.getenv("LORA_R", "4"))

# Alpha parameter for LoRA scaling
lora_alpha = int(os.getenv("LORA_ALPHA", "8"))

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = int(os.getenv("TRAIN_BATCH_SIZE", "1"))

# Batch size per GPU for evaluation
per_device_eval_batch_size = int(os.getenv("EVAL_BATCH_SIZE", "2"))

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = int(os.getenv("GRADIENT_ACCUMULATION_STEPS", "1"))

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# (saves memory and speeds up training considerably)
group_by_length = True

# Save a checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = int(os.getenv("LOGGING_STEPS", "50"))

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = int(os.getenv("MAX_SEQ_LENGTH", "512"))

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the current GPU
device_map = {"": torch.cuda.current_device()}

# Limit the number of training examples (-1 uses the full split)
limit = int(os.getenv("DATASET_LIMIT", "5000"))

dataset = load_dataset(dataset_name, split="train")
if limit != -1:
    dataset = dataset.shuffle(seed=42).select(range(limit))


def transform(data):
    question = data['question']
    context = data['context']
    answer = data['answer']
    template = "Question: {question}\nContext: {context}\nAnswer: {answer}"
    return {'text': template.format(question=question, context=context, answer=answer)}


transformed = dataset.map(transform)
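
# For illustration, the template applied to one hypothetical record (the
# field values below are made up for this sketch, not read from the dataset):
#
#   transform({"question": "How many singers are there?",
#              "context": "CREATE TABLE singer (singer_id NUMBER)",
#              "answer": "SELECT count(*) FROM singer"})
#   returns {"text": "Question: How many singers are there?\n"
#                    "Context: CREATE TABLE singer (singer_id NUMBER)\n"
#                    "Answer: SELECT count(*) FROM singer"}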

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
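
# Rough estimate, not a measured figure: at 4-bit precision the quantized
# base weights cost about half a byte per parameter, so the ~2.5B-parameter
# Gemma 2B base occupies on the order of 1.25 GB of GPU memory, before
# activations, LoRA weights, and optimizer state.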

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16")
        print("=" * 80)

# Load base model
# model = AutoModelForCausalLM.from_pretrained("google/gemma-7b")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    torch_dtype=torch.float16,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the Gemma tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)
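
# With r=4 on just the q_proj and v_proj matrices, each adapted matrix adds
# r * (d_in + d_out) trainable weights, so the adapter stays a tiny fraction
# of the size of the frozen 4-bit base model.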

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)
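
# The effective batch size per optimizer step is per_device_train_batch_size
# * gradient_accumulation_steps * number of GPUs; with the defaults above,
# that is one example per step on a single GPU.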

trainer = SFTTrainer(
    model=model,
    train_dataset=transformed,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

trainer.train()

trainer.model.save_pretrained(new_model)

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

model.push_to_hub(new_model, check_pr=True)

tokenizer.push_to_hub(new_model, check_pr=True)
# [END gke_aiml_llm_finetune_gemma_single_node_py]
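
A minimal inference sketch (not part of this commit): once the merged model has been pushed, it can be smoke-tested with a short generation script. The repo id and prompt below are illustrative assumptions.

# Query the fine-tuned, merged model pushed by finetune.py. Assumes the
# default NEW_MODEL value "gemma-2b-sql" names a Hub repo you can access.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

repo = "gemma-2b-sql"
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, device_map="auto")

prompt = (
    "Question: How many singers are there?\n"
    "Context: CREATE TABLE singer (singer_id NUMBER)\n"
    "Answer:"
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(generator(prompt, max_new_tokens=64)[0]["generated_text"])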