Skip to content

Commit

Permalink
Kaggarwal/add textgen cli (#2634)
Browse files Browse the repository at this point in the history
* Add CLI sample for text-gen

* Add cli sample for textgen

* resolve comments and fix black formatting issues
  • Loading branch information
aggarwal-k authored Sep 8, 2023
1 parent 280f7a9 commit fbfe7fc
Show file tree
Hide file tree
Showing 4 changed files with 371 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Managed online deployment definition.
# NOTE(review): presumably the deploy.yml passed to `az ml online-deployment create`,
# which overrides endpoint_name, model and instance_type via --set -- confirm.
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_E64S_v3
instance_count: 1
liveness_probe:
  # NOTE(review): presumably seconds before the first liveness check -- confirm against the schema
  initial_delay: 600
request_settings:
  # per-request timeout, in milliseconds
  request_timeout_ms: 90000

Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# import library to parse command line arguments
import argparse, os
import json

parser = argparse.ArgumentParser()
# name of the Hugging Face dataset to download
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
# dataset subset name; accepted on the command line but not used by the
# download loop below
parser.add_argument(
    "--dataset_subset", type=str, default="split", help="dataset subset name"
)
# directory the downloaded splits are written to
parser.add_argument(
    "--download_dir",
    type=str,
    default="data",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory; exist_ok=True replaces the racy
# "check, then create" sequence and is a no-op when the directory exists
os.makedirs(args.download_dir, exist_ok=True)


# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))


# preprocess the dataset
import pandas as pd


def get_preprocessed_samsum(df):
    """Turn raw SAMSum rows into prompt/summary pairs.

    Args:
        df: DataFrame with at least a ``dialogue`` and a ``summary`` column.
            It is left unmodified.

    Returns:
        A new DataFrame with exactly two columns, in this order: ``text``
        (the dialogue wrapped in the summarization prompt) and ``summary``.

    Raises:
        KeyError: if ``dialogue`` or ``summary`` is missing from ``df``.
    """
    prompt = "Summarize this dialog:\n{}\n---\nSummary:\n"

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    # Selecting the two output columns makes an explicit drop() unnecessary
    # and no longer requires an "id" column to be present.
    with_prompt = df.assign(text=df["dialogue"].map(prompt.format))
    return with_prompt[["text", "summary"]]


# Work in the directory the splits were downloaded to instead of a hard-coded
# "./samsum-dataset", so the script works for any --download_dir value.
data_dir = args.download_dir

test_df = pd.read_json(os.path.join(data_dir, "test.jsonl"), lines=True)
train_df = pd.read_json(os.path.join(data_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(os.path.join(data_dir, "validation.jsonl"), lines=True)
# map the train, validation and test dataframes to preprocess function
train_df = get_preprocessed_samsum(train_df)
validation_df = get_preprocessed_samsum(validation_df)
test_df = get_preprocessed_samsum(test_df)

# Save the preprocessed data
# frac=1 keeps every row (in shuffled order); lower it to write a smaller sample
frac = 1
train_df.sample(frac=frac).to_json(
    os.path.join(data_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=frac).to_json(
    os.path.join(data_dir, "small_validation.jsonl"), orient="records", lines=True
)
test_df.sample(frac=frac).to_json(
    os.path.join(data_dir, "small_test.jsonl"), orient="records", lines=True
)


# read the preprocessed test split back into a pandas dataframe
test_df = pd.read_json(os.path.join(data_dir, "small_test.jsonl"), lines=True)
# take 2 random samples
test_df = test_df.sample(n=2)
# rebuild index
test_df.reset_index(drop=True, inplace=True)

# create a json object with the key as "input_data" and value as a list of values from the text column of the test dataframe
test_json = {"input_data": {"text": list(test_df["text"])}}
# save the json object to a file named sample_score.json in the data directory
with open(os.path.join(data_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Pipeline job definition for the text-generation fine-tuning sample.
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

# experiment the submitted job is grouped under
experiment_name: text-generation-samsum

# Pipeline-level inputs. These are defaults: a caller can override any of them
# at submission time with `az ml job create --set inputs.<name>=<value>`.
inputs:
  # compute cluster used by each stage of the pipeline
  compute_model_import: gpu-cluster-big
  compute_preprocess: gpu-cluster-big
  compute_finetune: gpu-cluster-big
  compute_model_evaluation: gpu-cluster-big

  # specify the foundation model available in the azureml-meta registry
  mlflow_model_path:
    path: azureml://registries/azureml-meta/models/Llama-2-7b/versions/5

    # huggingface_id: 'gpt2' # if you want to use a huggingface model, uncomment this line and comment the above lines

  # map the dataset files to parameters
  train_file_path:
    type: uri_file
    path: "samsum-dataset/small_train.jsonl"
  validation_file_path:
    type: uri_file
    path: "samsum-dataset/small_validation.jsonl"
  test_file_path:
    type: uri_file
    path: "samsum-dataset/small_test.jsonl"


  # The following parameters map to the dataset fields
  text_key: "text"
  ground_truth_key: "summary"

  # training settings
  number_of_gpu_to_use_finetuning: 8
  num_train_epochs: 3
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  learning_rate: 2e-5

  # optimization params
  apply_lora: "true"
  apply_deepspeed: "true"
  apply_ort: "true"
  precision: 16

outputs:
  # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model
  # registering the model is required to deploy the model to an online or batch endpoint
  trained_model:
    type: mlflow_model

# Pipeline run settings.
# NOTE(review): presumably force_rerun disables reuse of earlier step results and
# continue_on_step_failure=false stops the pipeline at the first failing step --
# confirm against the pipeline job schema.
settings:
  force_rerun: true
  continue_on_step_failure: false

# The pipeline has a single step: the text_generation_pipeline component taken
# from the registry. Every step input is forwarded from the pipeline-level input
# of the same name.
jobs:
  text_generation_pipeline:
    type: pipeline
    component: azureml://registries/azureml/components/text_generation_pipeline/labels/latest
    inputs:
      mlflow_model_path: ${{parent.inputs.mlflow_model_path}}

      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_preprocess: ${{parent.inputs.compute_preprocess}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      compute_model_evaluation: ${{parent.inputs.compute_model_evaluation}}

      train_file_path: ${{parent.inputs.train_file_path}}
      validation_file_path: ${{parent.inputs.validation_file_path}}
      test_file_path: ${{parent.inputs.test_file_path}}

      text_key: ${{parent.inputs.text_key}}
      ground_truth_key: ${{parent.inputs.ground_truth_key}}

      number_of_gpu_to_use_finetuning: ${{parent.inputs.number_of_gpu_to_use_finetuning}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      per_device_train_batch_size: ${{parent.inputs.per_device_train_batch_size}}
      per_device_eval_batch_size: ${{parent.inputs.per_device_eval_batch_size}}
      learning_rate: ${{parent.inputs.learning_rate}}
      apply_lora: ${{parent.inputs.apply_lora}}
      apply_deepspeed: ${{parent.inputs.apply_deepspeed}}
      apply_ort: ${{parent.inputs.apply_ort}}
      precision: ${{parent.inputs.precision}}

    outputs:
      # expose the step's model folder as the pipeline output "trained_model"
      mlflow_model_folder: ${{parent.outputs.trained_model}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
set -x
# the commands in this file map to steps in this notebook: https://aka.ms/azureml-ft-sdk-text-generation
# the data files are available in the same folder as the above notebook

# script inputs
# Replace the three <...> placeholders before running: the pre-requisite check
# further down aborts while any of them still holds its placeholder value, so
# each placeholder must match the string that check compares against exactly.
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
# registry the finetuning pipeline component is read from
registry_name="azureml"
# registry the foundation model is read from
model_registry_name="azureml-meta"

compute_cluster="gpu-cluster-big"
# if above compute cluster does not exist, create it with the following vm size
compute_sku="Standard_ND40rs_v2"
# This is the number of GPUs in a single node of the selected 'compute_sku' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
# CLI command to fetch the number of GPUs
# az ml compute list-sizes -g <resource_group_name> -w <workspace_name> --subscription <subscription_id> --query "[?name=='<compute_sku>'].{Name:name, Gpus:gpus}" --output table
gpus_per_node=8
# This is the foundation model for finetuning
model_name="Llama-2-7b"
# the model version is pinned explicitly; using the latest version of the model is not working yet
model_version=5

# epoch-seconds timestamp, used as the registered model version and as the endpoint name suffix
version=$(date +%s)
finetuned_model_name=$model_name"-text-generation"
endpoint_name="samsum-$version"
# vm size used as the instance_type of the online deployment
deployment_sku="Standard_E64S_v3"


# training data
train_data="samsum-dataset/small_train.jsonl"
# validation data
validation_data="samsum-dataset/small_validation.jsonl"
# test data
test_data="samsum-dataset/small_test.jsonl"
# scoring_file
scoring_file="samsum-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="text_generation_pipeline"
# The following parameters map to the dataset fields
text_key="text"
ground_truth_key="summary"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute
num_train_epochs=3
per_device_train_batch_size=1
per_device_eval_batch_size=1
learning_rate=2e-5

# optimization params
# these values are passed to the pipeline job with --set and replace the defaults in the pipeline yml
apply_lora="true"
apply_deepspeed="false"
apply_ort="true"
precision=4

# 1. Setup pre-requisites

# abort while any of the script inputs still holds its placeholder value
if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
    [ "$resource_group_name" = "<RESOURCE_GROUP>" ] || \
    [ "$workspace_name" = "<WORKSPACE_NAME>" ]; then
    echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
    exit 1
fi

az account set -s "$subscription_id"
# NOTE: workspace_info bundles several CLI arguments in one string, so it is
# deliberately expanded unquoted to let the shell split it back into words.
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster exists, else create it
if az ml compute show --name "$compute_cluster" $workspace_info
then
    echo "Compute cluster $compute_cluster already exists"
else
    echo "Creating compute cluster $compute_cluster"
    az ml compute create --name "$compute_cluster" --type amlcompute --min-instances 0 --max-instances 2 --size "$compute_sku" $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster"
        exit 1
    }
fi

# download the dataset
# the script writes the raw and the preprocessed splits into samsum-dataset

if ! python ./download-dataset.py --download_dir samsum-dataset
then
    echo "Failed to download dataset"
    exit 1
fi

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
# expansions are quoted so an empty or space-containing value stays one argument
if ! az ml model show --name "$model_name" --version "$model_version" --registry-name "$model_registry_name"
then
    echo "Model $model_name:$model_version does not exist in registry $model_registry_name"
    exit 1
fi

# 3. Check if training data, validation data and test data exist

# Abort the script when a required data file is missing.
#   $1 - label used in the error message
#   $2 - path of the file to check
# The path is quoted: with an unquoted empty value `[ ! -f ]` evaluates to
# false and the check would pass silently.
require_data_file() {
    if [ ! -f "$2" ]; then
        echo "$1 $2 does not exist"
        exit 1
    fi
}

require_data_file "Training data" "$train_data"
require_data_file "Validation data" "$validation_data"
require_data_file "Test data" "$test_data"

# 4. Submit finetuning job using pipeline.yml

# check if the finetuning pipeline component exists
if ! az ml component show --name "$finetuning_pipeline_component" --label latest --registry-name "$registry_name"
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# need to switch to using latest version for model, currently blocked with a bug.
# submit finetuning job
# Every --set value replaces the matching default in the pipeline yml; the job
# name printed by the CLI is captured so the job can be streamed and its output
# registered afterwards. Values are quoted so each stays a single argument;
# $workspace_info is left unquoted on purpose because it holds several arguments.
parent_job_name=$( az ml job create --file ./text-generation-pipeline.yml $workspace_info --query name -o tsv --set \
    jobs.text_generation_pipeline.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
    inputs.compute_model_import="$compute_cluster" \
    inputs.compute_preprocess="$compute_cluster" \
    inputs.compute_finetune="$compute_cluster" \
    inputs.compute_model_evaluation="$compute_cluster" \
    inputs.mlflow_model_path.path="azureml://registries/$model_registry_name/models/$model_name/versions/$model_version" \
    inputs.train_file_path.path="$train_data" \
    inputs.validation_file_path.path="$validation_data" \
    inputs.test_file_path.path="$test_data" \
    inputs.text_key="$text_key" \
    inputs.ground_truth_key="$ground_truth_key" \
    inputs.number_of_gpu_to_use_finetuning="$number_of_gpu_to_use_finetuning" \
    inputs.num_train_epochs="$num_train_epochs" \
    inputs.per_device_train_batch_size="$per_device_train_batch_size" \
    inputs.per_device_eval_batch_size="$per_device_eval_batch_size" \
    inputs.learning_rate="$learning_rate" \
    inputs.apply_lora="$apply_lora" \
    inputs.apply_deepspeed="$apply_deepspeed" \
    inputs.apply_ort="$apply_ort" \
    inputs.precision="$precision" ) || {
    echo "Failed to submit finetuning job"
    exit 1
}

# follow the job output until the job finishes
az ml job stream --name "$parent_job_name" $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 5. Create model in workspace from train job output
# the model is registered from the pipeline job's trained_model output;
# $workspace_info is left unquoted on purpose because it holds several arguments
az ml model create --name "$finetuned_model_name" --version "$version" --type mlflow_model \
    --path "azureml://jobs/$parent_job_name/outputs/trained_model" $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 6. Deploy the model to an endpoint
# create online endpoint
az ml online-endpoint create --name "$endpoint_name" $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy the model registered in the workspace to the endpoint
# You can find here the list of SKU's supported for deployment - https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list
# the --set values are quoted so each stays a single argument;
# $workspace_info is left unquoted on purpose because it holds several arguments
az ml online-deployment create --file deploy.yml $workspace_info --all-traffic --set \
    endpoint_name="$endpoint_name" model="azureml:$finetuned_model_name:$version" \
    instance_type="$deployment_sku" || {
    echo "deployment create failed"; exit 1;
}

# 7. Try a sample scoring request

# Check if scoring data file exists
if [ -f "$scoring_file" ]; then
    # printf is used for the blank lines: whether echo expands "\n" depends on
    # the shell, and bash prints it literally
    printf 'Invoking endpoint %s with following input:\n\n' "$endpoint_name"
    cat "$scoring_file"
    printf '\n\n'
else
    echo "Scoring file $scoring_file does not exist"
    exit 1
fi

# $workspace_info is left unquoted on purpose because it holds several arguments
az ml online-endpoint invoke --name "$endpoint_name" --request-file "$scoring_file" $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 8. Delete the endpoint
# --yes skips the interactive confirmation so the script can run unattended;
# $workspace_info is left unquoted on purpose because it holds several arguments
az ml online-endpoint delete --name "$endpoint_name" $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}

0 comments on commit fbfe7fc

Please sign in to comment.