Merge branch 'main' into chatbot-universal-guide

GoogleCloudPlatform · Jun 12, 2024 · 0ec4ec2 · 0ec4ec2
2 parents 493f55a + 5254bf5
commit 0ec4ec2
Show file tree

Hide file tree

Showing 43 changed files with 673 additions and 360 deletions.
diff --git a/.github/workflows/ai-ml-gke-finetuning-gemma.yml b/.github/workflows/ai-ml-gke-finetuning-gemma.yml
@@ -0,0 +1,21 @@
+# CI for the llm-finetuning-gemma tutorial: verifies that its container image builds.
+name: ai-ml-finetune-gemma
+on:
+  # Triggered by pushes to main and by pull requests, but only when this
+  # workflow file or the tutorial directory changes.
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/workflows/ai-ml-gke-finetuning-gemma.yml'
+      - 'ai-ml/llm-finetuning-gemma/**'
+  pull_request:
+    paths:
+      - '.github/workflows/ai-ml-gke-finetuning-gemma.yml'
+      - 'ai-ml/llm-finetuning-gemma/**'
+jobs:
+  # NOTE(review): the job id mentions a100/jax, but the steps below only run a
+  # docker build of the tutorial image - confirm the id is intentional.
+  gke-a100-jax:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      # The image is built and tagged locally only; nothing is pushed.
+      - name: build container for llm-finetuning-gemma tutorial
+        run: |
+          cd ai-ml/llm-finetuning-gemma/
+          docker build --tag finetune .
diff --git a/ai-ml/gke-ray/gke-platform/modules/kuberay/kuberay.tf b/ai-ml/gke-ray/gke-platform/modules/kuberay/kuberay.tf
@@ -23,6 +23,6 @@ resource "helm_release" "kuberay-operator" {
   repository = "https://ray-project.github.io/kuberay-helm/"
   chart      = "kuberay-operator"
   values     = var.enable_autopilot ? [file("${path.module}/kuberay-operator-autopilot-values.yaml")] : [file("${path.module}/kuberay-operator-values.yaml")]
-  version    = "1.1.0"
+  version    = "1.1.1"
   namespace  = "${kubernetes_namespace.ray_namespace.metadata[0].name}"
 }
diff --git a/ai-ml/gke-ray/gke-platform/modules/kuberay/versions.tf b/ai-ml/gke-ray/gke-platform/modules/kuberay/versions.tf
@@ -20,7 +20,7 @@ terraform {
     }
     kubernetes = {
       source  = "hashicorp/kubernetes"
-      version = "2.29.0"
+      version = "2.30.0"
     }
   }
   provider_meta "google" {

diff --git a/ai-ml/gke-ray/gke-platform/modules/kubernetes/versions.tf b/ai-ml/gke-ray/gke-platform/modules/kubernetes/versions.tf
@@ -20,7 +20,7 @@ terraform {
     }
     kubernetes = {
       source  = "hashicorp/kubernetes"
-      version = "2.29.0"
+      version = "2.30.0"
     }
     kubectl = {
       source  = "alekc/kubectl"

diff --git a/ai-ml/gke-ray/gke-platform/versions.tf b/ai-ml/gke-ray/gke-platform/versions.tf
@@ -27,7 +27,7 @@ terraform {
     }
     kubernetes = {
       source  = "hashicorp/kubernetes"
-      version = "2.29.0"
+      version = "2.30.0"
     }
     kubectl = {
       source  = "alekc/kubectl"

diff --git a/ai-ml/llm-finetuning-gemma/Dockerfile b/ai-ml/llm-finetuning-gemma/Dockerfile
@@ -0,0 +1,32 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_llm_finetune_gemma_single_node_docker]
+FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04
+
+# Install Python and build tooling; the apt package lists are removed in the
+# same layer so they do not end up in the image.
+RUN apt-get update && \
+    apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \
+    rm -rf /var/lib/apt/lists/*
+
+# Python dependencies of finetune.py, pinned to exact versions.
+RUN pip3 install --no-cache-dir \
+    accelerate==0.30.1 bitsandbytes==0.43.1 \
+    datasets==2.19.1 transformers==4.41.0 \
+    peft==0.11.1 trl==0.8.6 torch==2.3.0
+
+COPY finetune.py /finetune.py
+
+# key=value form; the space-separated "ENV key value" syntax is legacy.
+ENV PYTHONUNBUFFERED=1
+
+# Exec (JSON array) form starts python3 directly instead of through /bin/sh -c.
+# NOTE(review): finetune.py as shown does not parse command-line arguments, so
+# "--device cuda" appears to be ignored; it is kept to preserve the original
+# invocation.
+CMD ["python3", "/finetune.py", "--device", "cuda"]
+# [END gke_aiml_llm_finetune_gemma_single_node_docker]
diff --git a/ai-ml/llm-finetuning-gemma/README.md b/ai-ml/llm-finetuning-gemma/README.md
@@ -0,0 +1,3 @@
+# Finetune an LLM with multiple GPUs in GKE samples
+
+TBD
diff --git a/ai-ml/llm-finetuning-gemma/finetune.py b/ai-ml/llm-finetuning-gemma/finetune.py
@@ -0,0 +1,251 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_aiml_llm_finetune_gemma_single_node_py]
+import os
+import torch
+from datasets import load_dataset, Dataset
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    HfArgumentParser,
+    TrainingArguments,
+    pipeline,
+    logging,
+)
+from peft import LoraConfig, PeftModel
+
+from trl import SFTTrainer
+
+# Base model to fine-tune, given as a Hugging Face Hub model ID
+# (override with the MODEL_NAME environment variable)
+model_name = os.getenv("MODEL_NAME", "google/gemma-2b")
+
+# The instruction dataset to use (Hugging Face Hub dataset ID)
+dataset_name = "b-mc2/sql-create-context"
+
+# Name used both for the locally saved LoRA adapter and for the merged model
+# pushed to the Hub (override with the NEW_MODEL environment variable)
+new_model = os.getenv("NEW_MODEL", "gemma-2b-sql")
+
+################################################################################
+# QLoRA parameters
+################################################################################
+
+# LoRA attention dimension, passed as `r` to LoraConfig
+# (override with the LORA_R environment variable)
+lora_r = int(os.getenv("LORA_R", "4"))
+
+# Alpha parameter for LoRA scaling
+# (override with the LORA_ALPHA environment variable)
+lora_alpha = int(os.getenv("LORA_ALPHA", "8"))
+
+# Dropout probability for LoRA layers
+lora_dropout = 0.1
+
+################################################################################
+# bitsandbytes parameters
+################################################################################
+
+# Activate 4-bit precision base model loading
+use_4bit = True
+
+# Compute dtype for 4-bit base models, given as the name of a torch dtype
+# attribute (it is resolved later with getattr(torch, ...))
+bnb_4bit_compute_dtype = "float16"
+
+# Quantization type (fp4 or nf4)
+bnb_4bit_quant_type = "nf4"
+
+# Activate nested quantization for 4-bit base models (double quantization)
+use_nested_quant = False
+
+################################################################################
+# TrainingArguments parameters
+################################################################################
+
+# Output directory where the model predictions and checkpoints will be stored
+output_dir = "./results"
+
+# Number of training epochs
+num_train_epochs = 1
+
+# Enable fp16/bf16 training (set bf16 to True with an A100)
+fp16 = True
+bf16 = False
+
+# Batch size per GPU for training
+# (override with the TRAIN_BATCH_SIZE environment variable)
+per_device_train_batch_size = int(os.getenv("TRAIN_BATCH_SIZE", "1"))
+
+# Batch size per GPU for evaluation
+# (override with the EVAL_BATCH_SIZE environment variable)
+# NOTE(review): this value is not referenced anywhere else in the script as
+# shown, and no evaluation dataset is configured - confirm whether it is needed.
+per_device_eval_batch_size = int(os.getenv("EVAL_BATCH_SIZE", "2"))
+
+# Number of update steps to accumulate the gradients for
+# (override with the GRADIENT_ACCUMULATION_STEPS environment variable)
+gradient_accumulation_steps = int(os.getenv("GRADIENT_ACCUMULATION_STEPS", "1"))
+
+# Enable gradient checkpointing
+gradient_checkpointing = True
+
+# Maximum gradient norm (gradient clipping)
+max_grad_norm = 0.3
+
+# Initial learning rate (AdamW optimizer)
+learning_rate = 2e-4
+
+# Weight decay to apply to all layers except bias/LayerNorm weights
+weight_decay = 0.001
+
+# Optimizer to use
+optim = "paged_adamw_32bit"
+
+# Learning rate schedule
+lr_scheduler_type = "cosine"
+
+# Number of training steps (overrides num_train_epochs)
+max_steps = -1
+
+# Ratio of steps for a linear warmup (from 0 to learning rate)
+warmup_ratio = 0.03
+
+# Group sequences into batches with same length
+# Saves memory and speeds up training considerably
+group_by_length = True
+
+# Save a checkpoint every X update steps
+save_steps = 0
+
+# Log every X update steps
+# (override with the LOGGING_STEPS environment variable)
+logging_steps = int(os.getenv("LOGGING_STEPS", "50"))
+
+################################################################################
+# SFT parameters
+################################################################################
+
+# Maximum sequence length to use
+# (override with the MAX_SEQ_LENGTH environment variable)
+max_seq_length = int(os.getenv("MAX_SEQ_LENGTH", "512"))
+
+# Pack multiple short examples in the same input sequence to increase efficiency
+packing = False
+
+# Load the entire model on the current CUDA device (the empty-string key maps
+# the whole model to that one device)
+device_map = {'':torch.cuda.current_device()}
+
+# Set limit to a positive number
+limit = int(os.getenv("DATASET_LIMIT", "5000"))
+
+dataset = load_dataset(dataset_name, split="train")
+if limit != -1:
+    dataset = dataset.shuffle(seed=42).select(range(limit))
+
+
+def transform(data):
+    question = data['question']
+    context = data['context']
+    answer = data['answer']
+    template = "Question: {question}\nContext: {context}\nAnswer: {answer}"
+    return {'text': template.format(question=question, context=context, answer=answer)}
+
+
+# Apply transform() to every example of the (possibly subsampled) dataset
+transformed = dataset.map(transform)
+
+# Load tokenizer and model with QLoRA configuration
+# Resolve the configured dtype name to the corresponding torch dtype object
+compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
+
+# Quantization settings handed to from_pretrained() when loading the base model
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=use_4bit,
+    bnb_4bit_quant_type=bnb_4bit_quant_type,
+    bnb_4bit_compute_dtype=compute_dtype,
+    bnb_4bit_use_double_quant=use_nested_quant,
+)
+
+# Check GPU compatibility with bfloat16
+# Informational only: prints a hint when the device's major compute capability
+# is 8 or higher; no setting is changed here.
+if compute_dtype == torch.float16 and use_4bit:
+    major, _ = torch.cuda.get_device_capability()
+    if major >= 8:
+        print("=" * 80)
+        print("Your GPU supports bfloat16")
+        print("=" * 80)
+
+# Load the base model with the 4-bit quantization config, placed on the
+# configured device
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map=device_map,
+    torch_dtype=torch.float16,
+)
+# Disable the generation cache on the model config for training
+model.config.use_cache = False
+# NOTE(review): pretraining_tp looks carried over from a LLaMA recipe -
+# confirm it has any effect for the model being trained.
+model.config.pretraining_tp = 1
+
+# Load the tokenizer that belongs to the base model
+# NOTE(review): trust_remote_code=True permits code shipped in the model repo
+# to run locally - confirm it is required for the models this script targets.
+tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Reuse the end-of-sequence token as the padding token
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
+
+# Load LoRA configuration (rank, alpha and dropout come from the QLoRA
+# parameters section)
+peft_config = LoraConfig(
+    lora_alpha=lora_alpha,
+    lora_dropout=lora_dropout,
+    r=lora_r,
+    bias="none",
+    task_type="CAUSAL_LM",
+    # LoRA adapters are applied only to modules named q_proj and v_proj
+    target_modules=["q_proj", "v_proj"]
+)
+
+# Set training parameters
+training_arguments = TrainingArguments(
+    output_dir=output_dir,
+    num_train_epochs=num_train_epochs,
+    per_device_train_batch_size=per_device_train_batch_size,
+    gradient_accumulation_steps=gradient_accumulation_steps,
+    optim=optim,
+    save_steps=save_steps,
+    logging_steps=logging_steps,
+    learning_rate=learning_rate,
+    weight_decay=weight_decay,
+    fp16=fp16,
+    bf16=bf16,
+    max_grad_norm=max_grad_norm,
+    max_steps=max_steps,
+    warmup_ratio=warmup_ratio,
+    group_by_length=group_by_length,
+    lr_scheduler_type=lr_scheduler_type,
+)
+
+# Supervised fine-tuning trainer: trains with the LoRA configuration on the
+# 'text' field of the transformed dataset
+trainer = SFTTrainer(
+    model=model,
+    train_dataset=transformed,
+    peft_config=peft_config,
+    dataset_text_field="text",
+    max_seq_length=max_seq_length,
+    tokenizer=tokenizer,
+    args=training_arguments,
+    packing=packing,
+)
+
+# Run the fine-tuning
+trainer.train()
+
+trainer.model.save_pretrained(new_model)
+
+# Reload model in FP16 and merge it with LoRA weights
+base_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    low_cpu_mem_usage=True,
+    return_dict=True,
+    torch_dtype=torch.float16,
+    device_map=device_map,
+)
+model = PeftModel.from_pretrained(base_model, new_model)
+model = model.merge_and_unload()
+
+model.push_to_hub(new_model, check_pr=True)
+
+tokenizer.push_to_hub(new_model, check_pr=True)
+# [END gke_aiml_llm_finetune_gemma_single_node_py]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Finetune an LLM with multiple GPUs in GKE samples

		TBD