diff --git a/cli/foundation-models/system/finetune/text-generation/deploy.yml b/cli/foundation-models/system/finetune/text-generation/deploy.yml
new file mode 100644
index 00000000000..ad0040b1b03
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-generation/deploy.yml
@@ -0,0 +1,9 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_E64S_v3
+instance_count: 1
+liveness_probe:
+  initial_delay: 600
+request_settings:
+  request_timeout_ms: 90000
+
diff --git a/cli/foundation-models/system/finetune/text-generation/download-dataset.py b/cli/foundation-models/system/finetune/text-generation/download-dataset.py
new file mode 100644
index 00000000000..5bcc64c4de3
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-generation/download-dataset.py
@@ -0,0 +1,84 @@
+# import library to parse command line arguments
+import argparse, os
+import json
+
+parser = argparse.ArgumentParser()
+# add an argument to specify a dataset name to download
+parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
+# add an argument to specify the dataset subset to download
+parser.add_argument(
+    "--dataset_subset", type=str, default="split", help="dataset subset name"
+)
+# add an argument to specify the directory to download the dataset to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="data",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, split=split)
+    # save the split of the dataset to the download directory as a json lines file
+    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+    # print dataset features
+
+
+# preprocess the dataset
+import pandas as pd
+
+
+def get_preprocessed_samsum(df):
+    prompt = "Summarize this dialog:\n{}\n---\nSummary:\n"
+
+    df["text"] = df["dialogue"].map(prompt.format)
+    df = df.drop(columns=["dialogue", "id"])
+    df = df[["text", "summary"]]
+
+    return df
+
+
+test_df = pd.read_json(os.path.join(args.download_dir, "test.jsonl"), lines=True)
+train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
+validation_df = pd.read_json(os.path.join(args.download_dir, "validation.jsonl"), lines=True)
+# apply the preprocess function to the train, validation and test dataframes
+train_df = get_preprocessed_samsum(train_df)
+validation_df = get_preprocessed_samsum(validation_df)
+test_df = get_preprocessed_samsum(test_df)
+
+# Save the preprocessed data
+frac = 1
+train_df.sample(frac=frac).to_json(
+    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
+)
+validation_df.sample(frac=frac).to_json(
+    os.path.join(args.download_dir, "small_validation.jsonl"), orient="records", lines=True
+)
+test_df.sample(frac=frac).to_json(
+    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
+)
+
+
+# read small_test.jsonl from the download directory into a pandas dataframe
+test_df = pd.read_json(os.path.join(args.download_dir, "small_test.jsonl"), lines=True)
+# take 2 random samples
+test_df = test_df.sample(n=2)
+# rebuild index
+test_df.reset_index(drop=True, inplace=True)
+test_df.head(2)
+
+# create a json object with the key "input_data" and a list of values from the text column of the test dataframe
+test_json = {"input_data": {"text": list(test_df["text"])}}
+# save the json object to a file named sample_score.json in the download directory
+with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
+    json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/text-generation/text-generation-pipeline.yml b/cli/foundation-models/system/finetune/text-generation/text-generation-pipeline.yml
new file mode 100644
index 00000000000..8ca5e226808
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-generation/text-generation-pipeline.yml
@@ -0,0 +1,87 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: text-generation-samsum
+
+inputs:
+  compute_model_import: gpu-cluster-big
+  compute_preprocess: gpu-cluster-big
+  compute_finetune: gpu-cluster-big
+  compute_model_evaluation: gpu-cluster-big
+
+  # specify the foundation model available in the azureml-meta system registry
+  mlflow_model_path:
+    path: azureml://registries/azureml-meta/models/Llama-2-7b/versions/5
+
+  # huggingface_id: 'gpt2' # if you want to use a huggingface model, uncomment this line and comment out the mlflow_model_path lines above
+
+  # map the dataset files to parameters
+  train_file_path:
+    type: uri_file
+    path: "samsum-dataset/small_train.jsonl"
+  validation_file_path:
+    type: uri_file
+    path: "samsum-dataset/small_validation.jsonl"
+  test_file_path:
+    type: uri_file
+    path: "samsum-dataset/small_test.jsonl"
+
+
+  # The following parameters map to the dataset fields
+  text_key: "text"
+  ground_truth_key: "summary"
+
+  # training settings
+  number_of_gpu_to_use_finetuning: 8
+  num_train_epochs: 3
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  learning_rate: 2e-5
+
+  # optimization params
+  apply_lora: "true"
+  apply_deepspeed: "true"
+  apply_ort: "true"
+  precision: 16
+
+outputs:
+  # map the output of the fine-tuning job to the output of the pipeline job so that we can easily register the fine-tuned model
+  # registering the model is required to deploy the model to an online or batch endpoint
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+  continue_on_step_failure: false
+
+jobs:
+  text_generation_pipeline:
+    type: pipeline
+    component: azureml://registries/azureml/components/text_generation_pipeline/labels/latest
+    inputs:
+      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}
+
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_preprocess: ${{parent.inputs.compute_preprocess}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+      train_file_path: ${{parent.inputs.train_file_path}}
+      validation_file_path: ${{parent.inputs.validation_file_path}}
+      test_file_path: ${{parent.inputs.test_file_path}}
+
+      text_key: ${{parent.inputs.text_key}}
+      ground_truth_key: ${{parent.inputs.ground_truth_key}}
+
+      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+      num_train_epochs: ${{parent.inputs.num_train_epochs}}
+      per_device_train_batch_size: ${{parent.inputs.per_device_train_batch_size}}
+      per_device_eval_batch_size: ${{parent.inputs.per_device_eval_batch_size}}
+      learning_rate: ${{parent.inputs.learning_rate}}
+      apply_lora: ${{parent.inputs.apply_lora}}
+      apply_deepspeed: ${{parent.inputs.apply_deepspeed}}
+      apply_ort: ${{parent.inputs.apply_ort}}
+      precision: ${{parent.inputs.precision}}
+
+    outputs:
+      mlflow_model_folder: ${{parent.outputs.trained_model}}
diff --git a/cli/foundation-models/system/finetune/text-generation/text-generation.sh b/cli/foundation-models/system/finetune/text-generation/text-generation.sh
new file mode 100644
index 00000000000..1566c36f662
--- /dev/null
+++ b/cli/foundation-models/system/finetune/text-generation/text-generation.sh
@@ -0,0 +1,191 @@
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-text-generation
+# the data files are available in the same folder as the above notebook
+
+# script inputs
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+registry_name="azureml"
+model_registry_name="azureml-meta"
+
+compute_cluster="gpu-cluster-big"
+# if the above compute cluster does not exist, create it with the following vm size
+compute_sku="Standard_ND40rs_v2"
+# This is the number of GPUs in a single node of the selected compute SKU.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
+# CLI command to fetch the number of GPUs
+# az ml compute list-sizes -g <resource_group_name> -w <workspace_name> --subscription <subscription_id> --query "[?name=='<compute_sku>'].{Name:name, Gpus:gpus}" --output table
+gpus_per_node=8
+# This is the foundation model for finetuning
+model_name="Llama-2-7b"
+# using the latest version of the model - not working yet
+model_version=5
+
+version=$(date +%s)
+finetuned_model_name=$model_name"-text-generation"
+endpoint_name="samsum-$version"
+deployment_sku="Standard_E64S_v3"
+
+
+# training data
+train_data="samsum-dataset/small_train.jsonl"
+# validation data
+validation_data="samsum-dataset/small_validation.jsonl"
+# test data
+test_data="samsum-dataset/small_test.jsonl"
+# scoring file
+scoring_file="samsum-dataset/sample_score.json"
+
+# finetuning job parameters
+finetuning_pipeline_component="text_generation_pipeline"
+# The following parameters map to the dataset fields
+text_key="text"
+ground_truth_key="summary"
+# Training settings
+number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
+num_train_epochs=3
+per_device_train_batch_size=1
+per_device_eval_batch_size=1
+learning_rate=2e-5
+
+# optimization params
+apply_lora="true"
+apply_deepspeed="false"
+apply_ort="true"
+precision=4
+
+# 1. Setup pre-requisites
+
+if [ "$subscription_id" = "" ] || \
+   [ "$resource_group_name" = "" ] || \
+   [ "$workspace_name" = "" ]; then
+    echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
+    exit 1
+fi
+
+az account set -s $subscription_id
+workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
+
+# check if $compute_cluster exists, else create it
+if az ml compute show --name $compute_cluster $workspace_info
+then
+    echo "Compute cluster $compute_cluster already exists"
+else
+    echo "Creating compute cluster $compute_cluster"
+    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
+        echo "Failed to create compute cluster $compute_cluster"
+        exit 1
+    }
+fi
+
+# download the dataset
+
+python ./download-dataset.py --download_dir samsum-dataset || {
+    echo "Failed to download dataset"
+    exit 1
+}
+
+# 2. Check if the model exists in the registry
+# need to confirm model show command works for registries outside the tenant (aka system registry)
+if ! az ml model show --name $model_name --version $model_version --registry-name $model_registry_name
+then
+    echo "Model $model_name:$model_version does not exist in registry $model_registry_name"
+    exit 1
+fi
+
+# 3. Check if training data, validation data and test data exist
+if [ ! -f $train_data ]; then
+    echo "Training data $train_data does not exist"
+    exit 1
+fi
+if [ ! -f $validation_data ]; then
+    echo "Validation data $validation_data does not exist"
+    exit 1
+fi
+if [ ! -f $test_data ]; then
+    echo "Test data $test_data does not exist"
+    exit 1
+fi
+
+# 4. Submit finetuning job using text-generation-pipeline.yml
+
+# check if the finetuning pipeline component exists
+if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
+then
+    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
+    exit 1
+fi
+
+# need to switch to using the latest version of the model, currently blocked by a bug.
+# submit finetuning job
+parent_job_name=$( az ml job create --file ./text-generation-pipeline.yml $workspace_info --query name -o tsv --set \
+  jobs.text_generation_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+  inputs.compute_model_import=$compute_cluster \
+  inputs.compute_preprocess=$compute_cluster \
+  inputs.compute_finetune=$compute_cluster \
+  inputs.compute_model_evaluation=$compute_cluster \
+  inputs.mlflow_model_path.path="azureml://registries/$model_registry_name/models/$model_name/versions/$model_version" \
+  inputs.train_file_path.path=$train_data \
+  inputs.validation_file_path.path=$validation_data \
+  inputs.test_file_path.path=$test_data \
+  inputs.text_key=$text_key \
+  inputs.ground_truth_key=$ground_truth_key \
+  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
+  inputs.num_train_epochs=$num_train_epochs \
+  inputs.per_device_train_batch_size=$per_device_train_batch_size \
+  inputs.per_device_eval_batch_size=$per_device_eval_batch_size \
+  inputs.learning_rate=$learning_rate \
+  inputs.apply_lora=$apply_lora \
+  inputs.apply_deepspeed=$apply_deepspeed \
+  inputs.apply_ort=$apply_ort \
+  inputs.precision=$precision ) || {
+    echo "Failed to submit finetuning job"
+    exit 1
+  }
+
+az ml job stream --name $parent_job_name $workspace_info || {
+    echo "job stream failed"; exit 1;
+}
+
+# 5. Create model in workspace from the training job output
+az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
+  --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info || {
+    echo "model create in workspace failed"; exit 1;
+}
+
+# 6. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy the registered model to the endpoint in the workspace
+# You can find the list of SKUs supported for deployment here: https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $scoring_file ]; then
+    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+    cat $scoring_file
+    echo -e "\n\n"
+else
+    echo "Scoring file $scoring_file does not exist"
+    exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $scoring_file $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+# 8. Delete the endpoint
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
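
Note (not part of the diff above): a minimal sketch of scoring the deployed endpoint over REST instead of az ml online-endpoint invoke, assuming the endpoint uses the default key authentication. It reuses the $endpoint_name, $workspace_info and $scoring_file variables defined in text-generation.sh.

# fetch the scoring URI and the primary key of the endpoint
scoring_uri=$(az ml online-endpoint show --name $endpoint_name $workspace_info --query scoring_uri -o tsv)
primary_key=$(az ml online-endpoint get-credentials --name $endpoint_name $workspace_info --query primaryKey -o tsv)
# POST the same sample_score.json payload directly to the scoring URI
curl -s -X POST "$scoring_uri" \
  -H "Authorization: Bearer $primary_key" \
  -H "Content-Type: application/json" \
  --data @$scoring_file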