* Add CLI sample for text-gen
* Add CLI sample for textgen
* Resolve comments and fix black formatting issues
1 parent 280f7a9, commit fbfe7fc
Showing 4 changed files with 371 additions and 0 deletions.
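The four files add a managed online deployment spec (deploy.yml), a script that downloads and preprocesses the samsum dataset (download-dataset.py), a pipeline job spec for finetuning (text-generation-pipeline.yml), and a driver script that runs the end-to-end finetune, register, deploy and score flow (text-generation.sh).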
cli/foundation-models/system/finetune/text-generation/deploy.yml (9 additions, 0 deletions)
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_E64S_v3
instance_count: 1
liveness_probe:
  initial_delay: 600
request_settings:
  request_timeout_ms: 90000
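For reference, step 6 of text-generation.sh below consumes this spec with az ml online-deployment create, overriding the endpoint name, model and instance type at deploy time (the angle-bracket values are placeholders):

az ml online-deployment create --file deploy.yml --all-traffic --set \
  endpoint_name=<ENDPOINT_NAME> model=azureml:<MODEL_NAME>:<VERSION> \
  instance_type=<INSTANCE_TYPE>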
cli/foundation-models/system/finetune/text-generation/download-dataset.py (84 additions, 0 deletions)
# import libraries to parse command line arguments and write json
import argparse, os
import json

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
# add an argument to specify a dataset subset name to download
parser.add_argument(
    "--dataset_subset", type=str, default="split", help="dataset subset name"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="data",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)


# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as a json lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))


# preprocess the dataset
import pandas as pd


def get_preprocessed_samsum(df):
    prompt = "Summarize this dialog:\n{}\n---\nSummary:\n"

    df["text"] = df["dialogue"].map(prompt.format)
    df = df.drop(columns=["dialogue", "id"])
    df = df[["text", "summary"]]

    return df

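# Illustration (not part of the original sample): a raw samsum record of the form
# {"id": ..., "dialogue": ..., "summary": ...} is rewritten by the function above to
# {"text": "Summarize this dialog:\n<dialogue>\n---\nSummary:\n", "summary": ...},
# matching the text_key="text" and ground_truth_key="summary" pipeline inputs below.
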
test_df = pd.read_json(os.path.join(args.download_dir, "test.jsonl"), lines=True)
train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# map the train, validation and test dataframes to the preprocess function
train_df = get_preprocessed_samsum(train_df)
validation_df = get_preprocessed_samsum(validation_df)
test_df = get_preprocessed_samsum(test_df)

# save the preprocessed data; lower frac to downsample the splits
frac = 1
train_df.sample(frac=frac).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"),
    orient="records",
    lines=True,
)
validation_df.sample(frac=frac).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=frac).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"),
    orient="records",
    lines=True,
)


# read the preprocessed test split back into a pandas dataframe
test_df = pd.read_json(os.path.join(args.download_dir, "small_test.jsonl"), lines=True)
# take 2 random samples
test_df = test_df.sample(n=2)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
print(test_df.head(2))

# create a json object with the key "input_data" and, as value, the list of texts from the test dataframe
test_json = {"input_data": {"text": list(test_df["text"])}}
# save the json object to a file named sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
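
For reference, text-generation.sh below runs this script as shown here; with --download_dir samsum-dataset it leaves the raw train/validation/test .jsonl splits, the preprocessed small_*.jsonl files, and sample_score.json (a request body of the shape {"input_data": {"text": [...]}}) in the samsum-dataset folder:

python ./download-dataset.py --download_dir samsum-dataset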
cli/foundation-models/system/finetune/text-generation/text-generation-pipeline.yml (87 additions, 0 deletions)
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

experiment_name: text-generation-samsum

inputs:
  compute_model_import: gpu-cluster-big
  compute_preprocess: gpu-cluster-big
  compute_finetune: gpu-cluster-big
  compute_model_evaluation: gpu-cluster-big

  # specify the foundation model available in the azureml system registry
  mlflow_model_path:
    path: azureml://registries/azureml/models/Llama-2-7b/versions/5
  # huggingface_id: 'gpt2' # to use a huggingface model instead, uncomment this line and comment out the mlflow_model_path lines above

  # map the dataset files to parameters
  train_file_path:
    type: uri_file
    path: "samsum-dataset/small_train.jsonl"
  validation_file_path:
    type: uri_file
    path: "samsum-dataset/small_validation.jsonl"
  test_file_path:
    type: uri_file
    path: "samsum-dataset/small_test.jsonl"

  # the following parameters map to the dataset fields
  text_key: "text"
  ground_truth_key: "summary"

  # training settings
  number_of_gpu_to_use_finetuning: 8
  num_train_epochs: 3
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  learning_rate: 2e-5

  # optimization params
  apply_lora: "true"
  apply_deepspeed: "true"
  apply_ort: "true"
  precision: 16

outputs:
  # map the output of the finetuning job to the output of the pipeline job,
  # so that the finetuned model can easily be registered;
  # registering the model is required to deploy it to an online or batch endpoint
  trained_model:
    type: mlflow_model

settings:
  force_rerun: true
  continue_on_step_failure: false

jobs:
  text_generation_pipeline:
    type: pipeline
    component: azureml://registries/azureml/components/text_generation_pipeline/labels/latest
    inputs:
      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}

      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_preprocess: ${{parent.inputs.compute_preprocess}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}

      train_file_path: ${{parent.inputs.train_file_path}}
      validation_file_path: ${{parent.inputs.validation_file_path}}
      test_file_path: ${{parent.inputs.test_file_path}}

      text_key: ${{parent.inputs.text_key}}
      ground_truth_key: ${{parent.inputs.ground_truth_key}}

      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      per_device_train_batch_size: ${{parent.inputs.per_device_train_batch_size}}
      per_device_eval_batch_size: ${{parent.inputs.per_device_eval_batch_size}}
      learning_rate: ${{parent.inputs.learning_rate}}
      apply_lora: ${{parent.inputs.apply_lora}}
      apply_deepspeed: ${{parent.inputs.apply_deepspeed}}
      apply_ort: ${{parent.inputs.apply_ort}}
      precision: ${{parent.inputs.precision}}

    outputs:
      mlflow_model_folder: ${{parent.outputs.trained_model}}
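
Step 4 of text-generation.sh below submits this spec with az ml job create, overriding every input via --set; a minimal standalone submission (resource group and workspace names are placeholders) would look like:

az ml job create --file ./text-generation-pipeline.yml \
  --resource-group <RESOURCE_GROUP> --workspace-name <WORKSPACE_NAME>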
cli/foundation-models/system/finetune/text-generation/text-generation.sh (191 additions, 0 deletions)
set -x
# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-text-generation
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
registry_name="azureml"
model_registry_name="azureml-meta"

compute_cluster="gpu-cluster-big"
# if the above compute cluster does not exist, create it with the following vm size
compute_sku="Standard_ND40rs_v2"
# This is the number of GPUs in a single node of the selected 'vm_size' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
# CLI command to fetch the number of GPUs:
# az ml compute list-sizes -g <resource_group_name> -w <workspace_name> --subscription <subscription_id> --query "[?name=='<compute_sku>'].{Name:name, Gpus:gpus}" --output table
gpus_per_node=8
# This is the foundation model for finetuning
model_name="Llama-2-7b"
# pinned model version; switching to the latest version is currently blocked by a bug (see step 4 below)
model_version=5

version=$(date +%s)
finetuned_model_name=$model_name"-text-generation"
endpoint_name="samsum-$version"
deployment_sku="Standard_E64S_v3"

# training data
train_data="samsum-dataset/small_train.jsonl"
# validation data
validation_data="samsum-dataset/small_validation.jsonl"
# test data
test_data="samsum-dataset/small_test.jsonl"
# scoring file
scoring_file="samsum-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="text_generation_pipeline"
# The following parameters map to the dataset fields
text_key="text"
ground_truth_key="summary"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
num_train_epochs=3
per_device_train_batch_size=1
per_device_eval_batch_size=1
learning_rate=2e-5

# optimization params
apply_lora="true"
apply_deepspeed="false"
apply_ort="true"
precision=4

# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
   [ "$resource_group_name" = "<RESOURCE_GROUP>" ] || \
   [ "$workspace_name" = "<WORKSPACE_NAME>" ]; then
    echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
    exit 1
fi

az account set -s $subscription_id
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster exists, else create it
if az ml compute show --name $compute_cluster $workspace_info
then
    echo "Compute cluster $compute_cluster already exists"
else
    echo "Creating compute cluster $compute_cluster"
    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster"
        exit 1
    }
fi

# download the dataset

python ./download-dataset.py --download_dir samsum-dataset || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm the model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $model_registry_name
then
    echo "Model $model_name:$model_version does not exist in registry $model_registry_name"
    exit 1
fi

# 3. Check if training data, validation data and test data exist
if [ ! -f $train_data ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -f $validation_data ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi
if [ ! -f $test_data ]; then
    echo "Test data $test_data does not exist"
    exit 1
fi

# 4. Submit finetuning job using pipeline.yml

# check if the finetuning pipeline component exists
if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# need to switch to using the latest version of the model; currently blocked by a bug
# submit finetuning job
parent_job_name=$( az ml job create --file ./text-generation-pipeline.yml $workspace_info --query name -o tsv --set \
  jobs.text_generation_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
  inputs.compute_model_import=$compute_cluster \
  inputs.compute_preprocess=$compute_cluster \
  inputs.compute_finetune=$compute_cluster \
  inputs.compute_model_evaluation=$compute_cluster \
  inputs.mlflow_model_path.path="azureml://registries/$model_registry_name/models/$model_name/versions/$model_version" \
  inputs.train_file_path.path=$train_data \
  inputs.validation_file_path.path=$validation_data \
  inputs.test_file_path.path=$test_data \
  inputs.text_key=$text_key \
  inputs.ground_truth_key=$ground_truth_key \
  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
  inputs.num_train_epochs=$num_train_epochs \
  inputs.per_device_train_batch_size=$per_device_train_batch_size \
  inputs.per_device_eval_batch_size=$per_device_eval_batch_size \
  inputs.learning_rate=$learning_rate \
  inputs.apply_lora=$apply_lora \
  inputs.apply_deepspeed=$apply_deepspeed \
  inputs.apply_ort=$apply_ort \
  inputs.precision=$precision ) || {
    echo "Failed to submit finetuning job"
    exit 1
}

az ml job stream --name $parent_job_name $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 5. Create model in workspace from train job output
az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
  --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 6. Deploy the model to an endpoint
# create online endpoint
az ml online-endpoint create --name $endpoint_name $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy model from registry to endpoint in workspace
# the list of SKUs supported for deployment is here: https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
  instance_type=$deployment_sku || {
    echo "deployment create failed"; exit 1;
}

# 7. Try a sample scoring request

# check if the scoring data file exists
if [ -f $scoring_file ]; then
    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
    cat $scoring_file
    echo -e "\n\n"
else
    echo "Scoring file $scoring_file does not exist"
    exit 1
fi

az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 8. Delete the endpoint
az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}