Kaggarwal/add textgen cli (#2634)

* Add CLI sample for text-gen * Add cli sample for textgen * resolve comments and fix black formatting issues
Azure · Sep 8, 2023 · fbfe7fc · fbfe7fc
1 parent 280f7a9
commit fbfe7fc
Show file tree

Hide file tree

Showing 4 changed files with 371 additions and 0 deletions.
diff --git a/cli/foundation-models/system/finetune/text-generation/deploy.yml b/cli/foundation-models/system/finetune/text-generation/deploy.yml
@@ -0,0 +1,9 @@
+# Managed online deployment settings for the fine-tuned text-generation model.
+# text-generation.sh supplies endpoint_name and model, and overrides
+# instance_type, through `--set` when it creates the deployment.
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_E64S_v3
+instance_count: 1
+liveness_probe:
+  # wait before the first liveness check (presumably seconds - confirm against
+  # the managed online deployment schema)
+  initial_delay: 600
+request_settings:
+  # 90000 ms = 90 s per scoring request
+  request_timeout_ms: 90000
+
diff --git a/cli/foundation-models/system/finetune/text-generation/download-dataset.py b/cli/foundation-models/system/finetune/text-generation/download-dataset.py
@@ -0,0 +1,84 @@
+# import library to parse command line arguments
+import argparse, os
+import json
+
+parser = argparse.ArgumentParser()
+# name of the Hugging Face dataset to download
+parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
+# name of the dataset subset; accepted on the command line, but nothing in
+# this script reads it after parsing
+parser.add_argument(
+    "--dataset_subset", type=str, default="split", help="dataset subset name"
+)
+# directory the dataset files are written to
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="data",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory; exist_ok avoids the check-then-create race
+# and makes re-runs a no-op
+os.makedirs(args.download_dir, exist_ok=True)
+
+
+# import hugging face datasets library
+from datasets import load_dataset, get_dataset_split_names
+
+# Write every split of the dataset to <download_dir>/<split>.jsonl.
+# NOTE: --dataset_subset is parsed above but is not passed to either call here.
+for split in get_dataset_split_names(args.dataset):
+    # load the split of the dataset
+    dataset = load_dataset(args.dataset, split=split)
+    # save the split of the dataset to the download directory as json lines file
+    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+
+
+# preprocess the dataset
+import pandas as pd
+
+
+def get_preprocessed_samsum(df):
+    """Build prompt/summary pairs from a raw SAMSum split.
+
+    Args:
+        df: DataFrame with at least ``dialogue`` and ``summary`` columns. It is
+            not modified.
+
+    Returns:
+        A new DataFrame with exactly two columns, in this order: ``text`` (the
+        dialogue wrapped in the summarization prompt) and ``summary``.
+
+    Raises:
+        KeyError: if ``dialogue`` or ``summary`` is missing from ``df``.
+    """
+    # plain str template: "{}" is filled with each dialogue by str.format
+    prompt = "Summarize this dialog:\n{}\n---\nSummary:\n"
+
+    # assign() works on a copy so the caller's frame keeps its original columns;
+    # selecting the two output columns makes a separate drop() unnecessary and
+    # no longer requires an "id" column to be present
+    return df.assign(text=df["dialogue"].map(prompt.format))[["text", "summary"]]
+
+
+# Read the raw splits from the directory they were downloaded to. Using
+# args.download_dir (instead of a hard-coded "./samsum-dataset") keeps this
+# step working for any --download_dir value, including the default "data".
+test_df = pd.read_json(os.path.join(args.download_dir, "test.jsonl"), lines=True)
+train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
+validation_df = pd.read_json(
+    os.path.join(args.download_dir, "validation.jsonl"), lines=True
+)
+# map the train, validation and test dataframes to preprocess function
+train_df = get_preprocessed_samsum(train_df)
+validation_df = get_preprocessed_samsum(validation_df)
+test_df = get_preprocessed_samsum(test_df)
+
+# Save the preprocessed data. frac is the fraction of rows kept in each
+# "small_*" file; with frac = 1 every row is kept and sample() only shuffles.
+frac = 1
+train_df.sample(frac=frac).to_json(
+    os.path.join(args.download_dir, "small_train.jsonl"),
+    orient="records",
+    lines=True,
+)
+validation_df.sample(frac=frac).to_json(
+    os.path.join(args.download_dir, "small_validation.jsonl"),
+    orient="records",
+    lines=True,
+)
+test_df.sample(frac=frac).to_json(
+    os.path.join(args.download_dir, "small_test.jsonl"),
+    orient="records",
+    lines=True,
+)
+
+
+# read small_test.jsonl back into a pandas dataframe
+test_df = pd.read_json(os.path.join(args.download_dir, "small_test.jsonl"), lines=True)
+# take up to 2 random samples (sample(n=2) raises on a frame with fewer rows)
+test_df = test_df.sample(n=min(2, len(test_df)))
+# rebuild index
+test_df.reset_index(drop=True, inplace=True)
+
+# create a json object with the key as "input_data" and value as a list of values from the text column of the test dataframe
+test_json = {"input_data": {"text": list(test_df["text"])}}
+# save the json object to a file named sample_score.json in the download directory
+with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
+    json.dump(test_json, f)
diff --git a/cli/foundation-models/system/finetune/text-generation/text-generation-pipeline.yml b/cli/foundation-models/system/finetune/text-generation/text-generation-pipeline.yml
@@ -0,0 +1,87 @@
+$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
+type: pipeline
+
+experiment_name: text-generation-samsum
+
+# Defaults for the pipeline. text-generation.sh overrides every one of these
+# through `az ml job create --set inputs.<name>=...`.
+inputs:
+  # compute target for each stage of the pipeline
+  compute_model_import: gpu-cluster-big
+  compute_preprocess: gpu-cluster-big
+  compute_finetune: gpu-cluster-big
+  compute_model_evaluation: gpu-cluster-big
+
+  # specify the foundation model available in the azureml-meta system registry
+  # (same registry that text-generation.sh uses as model_registry_name)
+  mlflow_model_path:
+    path: azureml://registries/azureml-meta/models/Llama-2-7b/versions/5
+
+  # huggingface_id: 'gpt2' # if you want to use a huggingface model, uncomment this line and comment the above lines
+
+  # map the dataset files to parameters
+  train_file_path:
+    type: uri_file
+    path: "samsum-dataset/small_train.jsonl"
+  validation_file_path:
+    type: uri_file
+    path: "samsum-dataset/small_validation.jsonl"
+  test_file_path:
+    type: uri_file
+    path: "samsum-dataset/small_test.jsonl"
+
+
+  # The following parameters map to the dataset fields
+  text_key: "text"
+  ground_truth_key: "summary"
+
+  # training settings
+  number_of_gpu_to_use_finetuning: 8
+  num_train_epochs: 3
+  per_device_train_batch_size: 1
+  per_device_eval_batch_size: 1
+  learning_rate: 2e-5
+
+  # optimization params
+  # NOTE(review): text-generation.sh submits with apply_deepspeed=false and
+  # precision=4, which differ from the defaults below - confirm which set is
+  # intended when this file is submitted on its own
+  apply_lora: "true"
+  apply_deepspeed: "true"
+  apply_ort: "true"
+  precision: 16
+
+outputs:
+  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
+  # registering the model is required to deploy the model to an online or batch endpoint
+  # (text-generation.sh registers it from azureml://jobs/<job name>/outputs/trained_model)
+  trained_model:
+    type: mlflow_model
+
+settings:
+  force_rerun: true
+  continue_on_step_failure: false
+
+jobs:
+  # Single step: the text_generation_pipeline component from the azureml
+  # registry. Every pipeline-level input is forwarded to it unchanged.
+  text_generation_pipeline:
+    type: pipeline
+    component: azureml://registries/azureml/components/text_generation_pipeline/labels/latest
+    inputs:
+      # foundation model to fine-tune
+      mlflow_model_path: ${{parent.inputs.mlflow_model_path}} 
+
+      # compute targets
+      compute_model_import: ${{parent.inputs.compute_model_import}}
+      compute_preprocess: ${{parent.inputs.compute_preprocess}}
+      compute_finetune: ${{parent.inputs.compute_finetune}}
+      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}
+
+      # dataset files
+      train_file_path: ${{parent.inputs.train_file_path}}
+      validation_file_path: ${{parent.inputs.validation_file_path}}
+      test_file_path: ${{parent.inputs.test_file_path}}
+
+      # dataset field names
+      text_key: ${{parent.inputs.text_key}}
+      ground_truth_key: ${{parent.inputs.ground_truth_key}}
+
+      # training and optimization settings
+      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
+      num_train_epochs: ${{parent.inputs.num_train_epochs}}
+      per_device_train_batch_size: ${{parent.inputs.per_device_train_batch_size}}
+      per_device_eval_batch_size: ${{parent.inputs.per_device_eval_batch_size}}
+      learning_rate: ${{parent.inputs.learning_rate}}
+      apply_lora: ${{parent.inputs.apply_lora}}
+      apply_deepspeed: ${{parent.inputs.apply_deepspeed}}
+      apply_ort: ${{parent.inputs.apply_ort}}
+      precision: ${{parent.inputs.precision}}
+
+    outputs:
+      # the component's mlflow_model_folder output becomes the pipeline's trained_model output
+      mlflow_model_folder: ${{parent.outputs.trained_model}}
diff --git a/cli/foundation-models/system/finetune/text-generation/text-generation.sh b/cli/foundation-models/system/finetune/text-generation/text-generation.sh
@@ -0,0 +1,191 @@
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-text-generation
+# the data files are available in the same folder as the above notebook
+
+# script inputs
+# The three placeholders below must be replaced before running; the
+# pre-requisite check compares against these exact strings, so each one needs
+# both angle brackets.
+subscription_id="<SUBSCRIPTION_ID>"
+resource_group_name="<RESOURCE_GROUP>"
+workspace_name="<WORKSPACE_NAME>"
+# registry that hosts the finetuning pipeline component
+registry_name="azureml"
+# registry that hosts the foundation model
+model_registry_name="azureml-meta"
+
+compute_cluster="gpu-cluster-big"
+# if above compute cluster does not exist, create it with the following vm size
+compute_sku="Standard_ND40rs_v2"
+# This is the number of GPUs in a single node of the selected 'compute_sku' compute.
+# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
+# Setting this to more than the number of GPUs will result in an error.
+# CLI command to fetch the number of GPUs
+# az ml compute list-sizes -g <resource_group_name> -w <workspace_name> --subscription <subscription_id> --query "[?name=='<compute_sku>'].{Name:name, Gpus:gpus}" --output table
+gpus_per_node=8
+# This is the foundation model for finetuning
+model_name="Llama-2-7b"
+# pinned model version; using the latest version of the model is not working yet
+model_version=5
+
+# epoch seconds, used as the registered model version and as the endpoint name suffix
+version=$(date +%s)
+finetuned_model_name="${model_name}-text-generation"
+endpoint_name="samsum-$version"
+deployment_sku="Standard_E64S_v3"
+
+
+# training data
+train_data="samsum-dataset/small_train.jsonl"
+# validation data
+validation_data="samsum-dataset/small_validation.jsonl"
+# test data
+test_data="samsum-dataset/small_test.jsonl"
+# scoring_file
+scoring_file="samsum-dataset/sample_score.json"
+
+# finetuning job parameters
+finetuning_pipeline_component="text_generation_pipeline"
+# The following parameters map to the dataset fields
+text_key="text"
+ground_truth_key="summary"
+# Training settings
+number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
+num_train_epochs=3
+per_device_train_batch_size=1
+per_device_eval_batch_size=1
+learning_rate=2e-5
+
+# optimization params
+apply_lora="true"
+apply_deepspeed="false"
+apply_ort="true"
+precision=4
+
+# 1. Setup pre-requisites
+
+# Abort if any input still equals its literal placeholder string.
+if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
+   [ "$resource_group_name" = "<RESOURCE_GROUP>" ] || \
+   [ "$workspace_name" = "<WORKSPACE_NAME>" ]; then 
+    echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
+    exit 1
+fi
+
+az account set -s $subscription_id
+# Expanded unquoted in later commands so that it splits into separate CLI arguments.
+workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
+
+# check if $compute_cluster exists, else create it
+if az ml compute show --name $compute_cluster $workspace_info
+then
+    echo "Compute cluster $compute_cluster already exists"
+else
+    echo "Creating compute cluster $compute_cluster"
+    # the cluster is created with 0 to 2 instances of $compute_sku
+    az ml compute create --name $compute_cluster --type amlcompute --min-instances 0 --max-instances 2 --size $compute_sku $workspace_info || {
+        echo "Failed to create compute cluster $compute_cluster"
+        exit 1
+    }
+fi
+
+# download the dataset
+# --download_dir matches the directory used by the train/validation/test/scoring paths above
+
+python ./download-dataset.py --download_dir samsum-dataset || {
+    echo "Failed to download dataset"
+    exit 1
+}
+
+# 2. Check if the model exists in the registry
+# need to confirm model show command works for registries outside the tenant (aka system registry)
+# The lookup uses $model_registry_name, not the workspace.
+if ! az ml model show --name $model_name --version $model_version --registry-name $model_registry_name 
+then
+    echo "Model $model_name:$model_version does not exist in registry $model_registry_name"
+    exit 1
+fi
+
+# 3. Check if training data, validation data and test data exist
+# The paths are quoted so that an empty value or a path containing spaces is
+# tested as a single file name instead of being mis-parsed by `[`.
+if [ ! -f "$train_data" ]; then
+    echo "Training data $train_data does not exist"
+    exit 1
+fi
+if [ ! -f "$validation_data" ]; then
+    echo "Validation data $validation_data does not exist"
+    exit 1
+fi
+if [ ! -f "$test_data" ]; then
+    echo "Test data $test_data does not exist"
+    exit 1
+fi
+
+# 4. Submit finetuning job using pipeline.yml
+
+# check if the finetuning pipeline component exists
+if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
+then
+    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
+    exit 1
+fi
+
+# need to switch to using latest version for model, currently blocked with a bug.
+# submit finetuning job
+# `--query name -o tsv` makes the command print only the job name, which is captured in parent_job_name.
+# Each `--set` entry overrides the corresponding value in text-generation-pipeline.yml.
+parent_job_name=$( az ml job create --file ./text-generation-pipeline.yml $workspace_info --query name -o tsv --set \
+  jobs.text_generation_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
+  inputs.compute_model_import=$compute_cluster \
+  inputs.compute_preprocess=$compute_cluster \
+  inputs.compute_finetune=$compute_cluster \
+  inputs.compute_model_evaluation=$compute_cluster \
+  inputs.mlflow_model_path.path="azureml://registries/$model_registry_name/models/$model_name/versions/$model_version" \
+  inputs.train_file_path.path=$train_data \
+  inputs.validation_file_path.path=$validation_data \
+  inputs.test_file_path.path=$test_data \
+  inputs.text_key=$text_key \
+  inputs.ground_truth_key=$ground_truth_key \
+  inputs.number_of_gpu_to_use_finetuning=$number_of_gpu_to_use_finetuning \
+  inputs.num_train_epochs=$num_train_epochs \
+  inputs.per_device_train_batch_size=$per_device_train_batch_size \
+  inputs.per_device_eval_batch_size=$per_device_eval_batch_size \
+  inputs.learning_rate=$learning_rate \
+  inputs.apply_lora=$apply_lora \
+  inputs.apply_deepspeed=$apply_deepspeed \
+  inputs.apply_ort=$apply_ort \
+  inputs.precision=$precision ) || {
+    echo "Failed to submit finetuning job"
+    exit 1
+  }
+
+# stream the job's output; the script stops here if streaming fails
+az ml job stream --name $parent_job_name $workspace_info || {
+    echo "job stream failed"; exit 1;
+}
+
+# 5. Create model in workspace from train job output
+# The path points at the pipeline job's trained_model output.
+az ml model create --name $finetuned_model_name --version $version --type mlflow_model \
+ --path azureml://jobs/$parent_job_name/outputs/trained_model $workspace_info  || {
+    echo "model create in workspace failed"; exit 1;
+}
+
+# 6. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info  || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy the fine-tuned model registered in the workspace in step 5 to the endpoint
+# endpoint_name, model and instance_type are supplied here through --set; the rest comes from deploy.yml
+# You can find here the list of SKU's supported for deployment - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
+az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml:$finetuned_model_name:$version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 7. Try a sample scoring request
+
+# Check if scoring data file exists
+# printf is used for the blank lines because `echo "\n"` prints a literal
+# backslash-n in shells whose echo does not interpret escapes (e.g. bash).
+if [ -f "$scoring_file" ]; then
+    printf 'Invoking endpoint %s with following input:\n\n' "$endpoint_name"
+    cat "$scoring_file"
+    printf '\n\n'
+else
+    echo "Scoring file $scoring_file does not exist"
+    exit 1
+fi
+
+# $workspace_info is left unquoted on purpose so it splits into separate arguments
+az ml online-endpoint invoke --name $endpoint_name --request-file "$scoring_file" $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+# 8. Delete the endpoint
+# --yes skips the interactive confirmation
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}