From 1a9bb6d18e495798a13f290d7828aa925d74bdfe Mon Sep 17 00:00:00 2001 From: Vlad Shulman Date: Mon, 25 Mar 2024 20:55:05 -0700 Subject: [PATCH 1/3] 2tp llama-2-7b --- llama/llama-2-7b-trt-llm-2tp/README.md | 68 +++++ .../llama-2-7b-trt-llm-2tp/TRT-LLM-README.md | 91 ++++++ llama/llama-2-7b-trt-llm-2tp/config.yaml | 38 +++ .../data/.gitattributes | 37 +++ .../llama-2-7b-trt-llm-2tp/model/__init__.py | 0 llama/llama-2-7b-trt-llm-2tp/model/model.py | 139 ++++++++++ .../llama-2-7b-trt-llm-2tp/packages/client.py | 155 +++++++++++ .../ensemble/config.pbtxt | 246 +++++++++++++++++ .../postprocessing/1/model.py | 180 ++++++++++++ .../postprocessing/config.pbtxt | 64 +++++ .../preprocessing/1/model.py | 259 ++++++++++++++++++ .../preprocessing/config.pbtxt | 99 +++++++ .../tensorrt_llm/config.pbtxt | 208 ++++++++++++++ .../llama-2-7b-trt-llm-2tp/packages/utils.py | 73 +++++ 14 files changed, 1657 insertions(+) create mode 100644 llama/llama-2-7b-trt-llm-2tp/README.md create mode 100644 llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md create mode 100644 llama/llama-2-7b-trt-llm-2tp/config.yaml create mode 100644 llama/llama-2-7b-trt-llm-2tp/data/.gitattributes create mode 100644 llama/llama-2-7b-trt-llm-2tp/model/__init__.py create mode 100644 llama/llama-2-7b-trt-llm-2tp/model/model.py create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/client.py create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt create mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/utils.py diff --git a/llama/llama-2-7b-trt-llm-2tp/README.md b/llama/llama-2-7b-trt-llm-2tp/README.md new file mode 100644 index 00000000..97cdc88a --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/README.md @@ -0,0 +1,68 @@ +# LLaMA2-7B-Chat Truss + +This is a [Truss](https://truss.baseten.co/) for an fp8 TRT 2TP version of LLaMA2-7B-Chat. Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA2-7B-Chat. + +**Warning: This example is only intended for usage on a single A100, changing your resource type for this deployment will result in unsupported behavior** + +## Truss + +Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models onto Baseten. Using Truss, you can develop a GPU model using [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers and deploy on Baseten. + +## Deploying LLaMA2-7B-Chat + +First, clone this repository: + +```sh +git clone https://github.com/basetenlabs/truss-examples/ +cd llama/llama-2-7b-trt-llm +``` + +Before deployment: + +1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). +2. 
Install the latest version of Truss: `pip install --upgrade truss`

With `llama-2-7b-trt-llm` as your working directory, you can deploy the model with:

```sh
truss push --publish
```

Paste your Baseten API key if prompted.

For more information, see [Truss documentation](https://truss.baseten.co).

## LLaMA2-7B API documentation

This section provides an overview of the LLaMA2-7B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction.

### API route: `predict`

We expect requests with the following information:

- `prompt` (str): The prompt you'd like to complete.
- `max_tokens` (int, default: 50): The max token count. This includes the number of tokens in your prompt, so if this value is smaller than your prompt, you'll just receive a truncated version of the prompt.
- `beam_width` (int, default: 1): The number of beams to compute. This must be 1 for this version of TRT-LLM; in-flight batching does not support beams > 1.
- `bad_words_list` (list, default: []): A list of words to exclude from the generated output.
- `stop_words_list` (list, default: []): A list of words that stop generation when encountered.
- `repetition_penalty` (float, default: 1.0): A repetition penalty to discourage repeating tokens.

This Truss streams responses back as buffered chunks of text.

## Example usage

```sh
truss predict -d '{"prompt": "What is the meaning of life?"}'
```

You can also invoke your model via a REST API:

```sh
curl -X POST "https://app.baseten.co/models/YOUR_MODEL_ID/predict" \
     -H "Content-Type: application/json" \
     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
     -d '{
       "prompt": "What is the meaning of life?"
     }'
```
diff --git a/llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md b/llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md
new file mode 100644
index 00000000..981f2050
--- /dev/null
+++ b/llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md
@@ -0,0 +1,91 @@

# TRTLLM

### Overview
This Truss adds support for TRT-LLM engines via Triton Inference Server. TRT-LLM is a highly performant language model runtime. We leverage the C++ runtime to take advantage of in-flight batching (also known as continuous batching).

### Prerequisites

To use this Truss, your engine must be built with in-flight batching support. Refer to your architecture-specific `build.py` for how to build with in-flight batching support.

### Config

This Truss is primarily config driven, meaning most settings you'll need to edit are located in `config.yaml`, underneath the `model_metadata` key.

- `tensor_parallelism` (int): If you built your model with tensor parallelism support, set this to the same value used during the engine build step. It should also match the number of GPUs in the `resources` section.

*Pipeline parallelism is not supported in this version but will be added later. As noted by Nvidia, pipeline parallelism reduces the need for high-bandwidth communication but may incur load-balancing issues and may be less efficient in terms of GPU utilization.*

- `engine_repository` (str): We expect engines to be uploaded to Hugging Face with a flat directory structure (i.e., the engine and associated files are not nested under a folder). This value is the full `{org_name}/{repo_name}` string. Engines can be private or public.
- `tokenizer_repository` (str): Engines do not come bundled with their own tokenizer. This is the Hugging Face repository where we can find a tokenizer. Tokenizers can be private or public.

If the engine and tokenizer repositories are private, you'll need to update the `secrets` section of the `config.yaml` as follows:

```
secrets:
  hf_access_token: "my_hf_api_key"
```

### Performance

TRT-LLM engines are designed to be highly performant. Once your Truss has been deployed, you may find that you're not fully utilizing the GPU. The following are levers to improve performance, but they require trial and error to identify appropriate values. All of these values live inside the `config.pbtxt` files for a given ensemble model.

#### Preprocessing / Postprocessing

```
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
```
By default, we load 1 instance of the pre/post models. If you find that the tokenizer is a bottleneck, increasing the `count` value here will load more replicas of these models, and Triton will automatically load balance across model instances.

#### TensorRT-LLM
```
parameters: {
  key: "max_tokens_in_paged_kv_cache"
  value: {
    string_value: "10000"
  }
}
```
By default, we set `max_tokens_in_paged_kv_cache` to 10000. For a 7B model on 1 A100 with a batch size of 8, we have over 60GB of GPU memory left over. We can comfortably increase this value to 100k and allow for more tokens in the KV cache. Your mileage will vary based on the size of your model and the hardware you're running on.

```
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.1"
  }
}
```
By default, if `max_tokens_in_paged_kv_cache` is unset, Triton Inference Server will attempt to preallocate this fraction of free GPU memory for the KV cache.

```
parameters: {
  key: "max_num_sequences"
  value: {
    string_value: "64"
  }
}
```
The `max_num_sequences` parameter is the maximum number of requests that the inference server can maintain state for at a given time (state = KV cache + decoder state).
See this [comment](https://github.com/NVIDIA/TensorRT-LLM/issues/65#issuecomment-1774332446) for more details. Setting this value higher allows for more parallel processing but uses more GPU memory.

### API

We expect requests with the following information:

- `prompt` (str): The prompt you'd like to complete.
- `max_tokens` (int, default: 50): The max token count. This includes the number of tokens in your prompt, so if this value is smaller than your prompt, you'll just receive a truncated version of the prompt.
- `beam_width` (int, default: 1): The number of beams to compute. This must be 1 for this version of TRT-LLM; in-flight batching does not support beams > 1.
- `bad_words_list` (list, default: []): A list of words to exclude from the generated output.
- `stop_words_list` (list, default: []): A list of words that stop generation when encountered.
- `repetition_penalty` (float, default: 1.0): A repetition penalty to discourage repeating tokens.

This Truss streams responses back as buffered chunks of text.
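Because responses arrive as buffered chunks of text, a client should read the HTTP response incrementally rather than waiting for the full body. Below is a minimal Python sketch of such a client, mirroring the curl example in the README; the model URL, model ID, and API key are placeholders to substitute with your own values.

```python
import requests

# Placeholders: substitute your deployed model ID and Baseten API key.
MODEL_URL = "https://app.baseten.co/models/YOUR_MODEL_ID/predict"
API_KEY = "YOUR_API_KEY"

resp = requests.post(
    MODEL_URL,
    headers={"Authorization": f"Api-Key {API_KEY}"},
    json={
        "prompt": "What is the meaning of life?",
        "max_tokens": 128,
        "beam_width": 1,  # must stay 1; in-flight batching does not support wider beams
    },
    stream=True,
)
resp.raise_for_status()

# Print each buffered text chunk as it arrives.
for chunk in resp.iter_content(chunk_size=None):
    if chunk:
        print(chunk.decode("utf-8"), end="", flush=True)
```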
diff --git a/llama/llama-2-7b-trt-llm-2tp/config.yaml b/llama/llama-2-7b-trt-llm-2tp/config.yaml new file mode 100644 index 00000000..06369d60 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/config.yaml @@ -0,0 +1,38 @@ +apply_library_patches: true +base_image: + image: docker.io/baseten/trtllm-server:r23.12_baseten_v0.9.0.dev2024022000 + python_executable_path: /usr/bin/python3 +description: Generate text from a prompt with this seven billion parameter language + model. +build: + arguments: + engine_repository: strangervb/mistral_fp8_i100_o400_tp2_v0 + pipeline_parallel_count: 1 + tensor_parallel_count: 2 + tokenizer_repository: NousResearch/Llama-2-7b-chat-hf +environment_variables: {} +external_package_dirs: [] +model_metadata: + avatar_url: https://cdn.baseten.co/production/static/explore/meta.png + cover_image_url: https://cdn.baseten.co/production/static/explore/llama.png + engine_repository: strangervb/mistral_fp8_i100_o400_tp2_v0 + example_model_input: + max_tokens: 100 + prompt: What's the meaning of life? + repo_id: NousResearch/Llama-2-7b-chat-hf + tags: + - text-generation + tensor_parallelism: 2 + tokenizer_repository: NousResearch/Llama-2-7b-chat-hf +model_name: Llama 7B Chat TRT 2TP +python_version: py311 +requirements: +- tritonclient[all] +- transformers +- jinja2 +resources: + accelerator: H100:2 + use_gpu: true +runtime: + predict_concurrency: 4 +secrets: {} diff --git a/llama/llama-2-7b-trt-llm-2tp/data/.gitattributes b/llama/llama-2-7b-trt-llm-2tp/data/.gitattributes new file mode 100644 index 00000000..728629f3 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/data/.gitattributes @@ -0,0 +1,37 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +gpt_float16_tp2_rank0.engine filter=lfs diff=lfs merge=lfs -text +gpt_float16_tp2_rank1.engine filter=lfs diff=lfs merge=lfs -text diff --git a/llama/llama-2-7b-trt-llm-2tp/model/__init__.py b/llama/llama-2-7b-trt-llm-2tp/model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/llama/llama-2-7b-trt-llm-2tp/model/model.py 
b/llama/llama-2-7b-trt-llm-2tp/model/model.py new file mode 100644 index 00000000..a1306df1 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/model/model.py @@ -0,0 +1,139 @@ +import os +from itertools import count +from pathlib import Path +from threading import Thread + +import numpy as np +from client import TritonClient, UserData +from transformers import AutoTokenizer +from utils import download_engine, prepare_grpc_tensor, server_loaded + +TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/") + + +class Model: + def __init__(self, **kwargs): + self._data_dir = kwargs["data_dir"] + self._config = kwargs["config"] + self._secrets = kwargs["secrets"] + self._request_id_counter = count(start=1) + self.triton_client = None + self.tokenizer = None + self.uses_openai_api = ( + "openai-compatible" in self._config["model_metadata"]["tags"] + ) + + def load(self): + tensor_parallel_count = self._config["model_metadata"].get( + "tensor_parallelism", 1 + ) + pipeline_parallel_count = self._config["model_metadata"].get( + "pipeline_parallelism", 1 + ) + if "hf_access_token" in self._secrets._base_secrets.keys(): + hf_access_token = self._secrets["hf_access_token"] + else: + hf_access_token = None + is_external_engine_repo = "engine_repository" in self._config["model_metadata"] + + # Instantiate TritonClient + self.triton_client = TritonClient( + data_dir=self._data_dir, + model_repository_dir=TRITON_MODEL_REPOSITORY_PATH, + parallel_count=tensor_parallel_count * pipeline_parallel_count, + ) + + # Download model from Hugging Face Hub if specified + if is_external_engine_repo: + if not server_loaded(): + download_engine( + engine_repository=self._config["model_metadata"][ + "engine_repository" + ], + fp=self._data_dir, + auth_token=hf_access_token, + ) + + # Load Triton Server and model + tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"] + env = {"triton_tokenizer_repository": tokenizer_repository} + if hf_access_token is not None: + env["HUGGING_FACE_HUB_TOKEN"] = hf_access_token + + self.triton_client.load_server_and_model(env=env) + + # setup eos token + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_repository, token=hf_access_token + ) + self.eos_token_id = self.tokenizer.eos_token_id + + def predict(self, model_input): + user_data = UserData() + model_name = "ensemble" + stream_uuid = str(os.getpid()) + str(next(self._request_id_counter)) + + if self.uses_openai_api: + prompt = self.tokenizer.apply_chat_template( + model_input.get("messages"), + tokenize=False, + ) + else: + prompt = model_input.get("prompt") + + max_tokens = model_input.get("max_tokens", 50) + beam_width = model_input.get("beam_width", 1) + bad_words_list = model_input.get("bad_words_list", [""]) + stop_words_list = model_input.get("stop_words_list", [""]) + repetition_penalty = model_input.get("repetition_penalty", 1.0) + ignore_eos = model_input.get("ignore_eos", False) + stream = model_input.get("stream", True) + + input0 = [[prompt]] + input0_data = np.array(input0).astype(object) + output0_len = np.ones_like(input0).astype(np.uint32) * max_tokens + bad_words_list = np.array([bad_words_list], dtype=object) + stop_words_list = np.array([stop_words_list], dtype=object) + stream_data = np.array([[stream]], dtype=bool) + beam_width_data = np.array([[beam_width]], dtype=np.uint32) + repetition_penalty_data = np.array([[repetition_penalty]], dtype=np.float32) + + inputs = [ + prepare_grpc_tensor("text_input", input0_data), + prepare_grpc_tensor("max_tokens", 
output0_len), + prepare_grpc_tensor("bad_words", bad_words_list), + prepare_grpc_tensor("stop_words", stop_words_list), + prepare_grpc_tensor("stream", stream_data), + prepare_grpc_tensor("beam_width", beam_width_data), + prepare_grpc_tensor("repetition_penalty", repetition_penalty_data), + ] + + if not ignore_eos: + end_id_data = np.array([[self.eos_token_id]], dtype=np.uint32) + inputs.append(prepare_grpc_tensor("end_id", end_id_data)) + else: + # do nothing, trt-llm by default doesn't stop on `eos` + pass + + # Start GRPC stream in a separate thread + stream_thread = Thread( + target=self.triton_client.start_grpc_stream, + args=(user_data, model_name, inputs, stream_uuid), + ) + stream_thread.start() + + def generate(): + # Yield results from the queue + for i in TritonClient.stream_predict(user_data): + yield i + + # Clean up GRPC stream and thread + self.triton_client.stop_grpc_stream(stream_uuid, stream_thread) + + if stream: + return generate() + else: + if self.uses_openai_api: + return "".join(generate()) + else: + return {"text": "".join(generate())} diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/client.py b/llama/llama-2-7b-trt-llm-2tp/packages/client.py new file mode 100644 index 00000000..a996b074 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/client.py @@ -0,0 +1,155 @@ +import json +import os +import subprocess +import time +from functools import partial +from pathlib import Path +from queue import Queue +from threading import Thread + +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException +from utils import ( + GRPC_SERVICE_PORT, + HTTP_SERVICE_PORT, + prepare_model_repository, + server_loaded, +) + + +class UserData: + def __init__(self): + self._completed_requests = Queue() + + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + + +class TritonClient: + def __init__(self, data_dir: Path, model_repository_dir: Path, parallel_count=1): + self._data_dir = data_dir + self._model_repository_dir = model_repository_dir + self._parallel_count = parallel_count + self._http_client = None + self._grpc_client_map = {} + + def start_grpc_stream(self, user_data, model_name, inputs, stream_uuid): + """Starts a GRPC stream and sends a request to the Triton server.""" + grpc_client_instance = grpcclient.InferenceServerClient( + url=f"localhost:{GRPC_SERVICE_PORT}", verbose=False + ) + self._grpc_client_map[stream_uuid] = grpc_client_instance + grpc_client_instance.start_stream(callback=partial(callback, user_data)) + grpc_client_instance.async_stream_infer( + model_name, + inputs, + request_id=stream_uuid, + enable_empty_final_response=True, + ) + + def stop_grpc_stream(self, stream_uuid, stream_thread: Thread): + """Closes a GRPC stream and stops the associated thread.""" + triton_grpc_stream = self._grpc_client_map[stream_uuid] + triton_grpc_stream.stop_stream() + stream_thread.join() + del self._grpc_client_map[stream_uuid] + + def start_server( + self, + mpi: int = 1, + env: dict = {}, + ): + """Triton Inference Server has different startup commands depending on + whether it is running in a TP=1 or TP>1 configuration. 
This function + starts the server with the appropriate command.""" + if mpi == 1: + command = [ + "tritonserver", + "--model-repository", + str(self._model_repository_dir), + "--grpc-port", + f"{GRPC_SERVICE_PORT}", + "--http-port", + f"{HTTP_SERVICE_PORT}", + ] + command = [ + "mpirun", + "--allow-run-as-root", + ] + for i in range(mpi): + command += [ + "-n", + "1", + "tritonserver", + "--model-repository", + str(self._model_repository_dir), + "--grpc-port", + f"{GRPC_SERVICE_PORT}", + "--http-port", + f"{HTTP_SERVICE_PORT}", + "--disable-auto-complete-config", + f"--backend-config=python,shm-region-prefix-name=prefix{str(i)}_", + ":", + ] + return subprocess.Popen( + command, + env={**os.environ, **env}, + ) + + def load_server_and_model(self, env: dict): + """Loads the Triton server and the model.""" + if not server_loaded(): + prepare_model_repository(self._data_dir) + self.start_server(mpi=self._parallel_count, env=env) + + self._http_client = httpclient.InferenceServerClient( + url=f"localhost:{HTTP_SERVICE_PORT}", verbose=False + ) + is_server_up = False + while not is_server_up: + try: + is_server_up = self._http_client.is_server_live() + except ConnectionRefusedError: + time.sleep(2) + continue + + while self._http_client.is_model_ready(model_name="ensemble") == False: + time.sleep(2) + continue + + @staticmethod + def stream_predict(user_data: UserData): + """Static method to yield predictions or errors based on input and a streaming user_data queue.""" + + def _is_final_response(result): + """Check if the given result is a final response according to Triton's specification.""" + if isinstance(result, InferenceServerException): + return True + + if result: + final_response_param = result.get_response().parameters.get( + "triton_final_response" + ) + return ( + final_response_param.bool_param if final_response_param else False + ) + return False + + result = None + + while not _is_final_response(result): + try: + result = user_data._completed_requests.get() + if not isinstance(result, InferenceServerException): + res = result.as_numpy("text_output") + yield res[0].decode("utf-8") + else: + yield json.dumps({"status": "error", "message": result.message()}) + except Exception as e: + yield json.dumps({"status": "error", "message": str(e)}) + break diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt new file mode 100644 index 00000000..01131a7e --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt @@ -0,0 +1,246 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "ensemble" +platform: "ensemble" +max_batch_size: 128 +input [ + { + name: "text_input" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "max_tokens" + data_type: TYPE_UINT32 + dims: [ -1 ] + }, + { + name: "bad_words" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "stop_words" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "pad_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "length_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "min_length" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + optional: true + }, + { + name: "stream" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "text_output" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] +ensemble_scheduling { + step [ + { + model_name: "preprocessing" + model_version: -1 + input_map { + key: "QUERY" + value: "text_input" + } + input_map { + key: "REQUEST_OUTPUT_LEN" + value: "max_tokens" + } + input_map { + key: "BAD_WORDS_DICT" + value: "bad_words" + } + input_map { + key: "STOP_WORDS_DICT" + value: "stop_words" + } + output_map { + key: "REQUEST_INPUT_LEN" + value: "_REQUEST_INPUT_LEN" + } + output_map { + key: "INPUT_ID" + value: "_INPUT_ID" + } + output_map { + key: "REQUEST_OUTPUT_LEN" + value: "_REQUEST_OUTPUT_LEN" + } + }, + { + model_name: "tensorrt_llm" + model_version: -1 + input_map { + key: "input_ids" + value: "_INPUT_ID" + } + input_map { + key: "input_lengths" + value: "_REQUEST_INPUT_LEN" + } + input_map { + key: "request_output_len" + value: "_REQUEST_OUTPUT_LEN" + } + input_map { + key: "end_id" + value: "end_id" + } + input_map { + key: "pad_id" + value: "pad_id" + } + input_map { + key: "runtime_top_k" + value: "top_k" + } + input_map { + key: "runtime_top_p" + value: "top_p" + } + input_map { + key: "temperature" + value: "temperature" + } + input_map { + key: "len_penalty" + value: "length_penalty" + } + input_map { + key: "repetition_penalty" + value: "repetition_penalty" + } + input_map { + key: "min_length" + value: "min_length" + } + input_map { + key: 
"presence_penalty" + value: "presence_penalty" + } + input_map { + key: "random_seed" + value: "random_seed" + } + input_map { + key: "beam_width" + value: "beam_width" + } + input_map { + key: "streaming" + value: "stream" + } + output_map { + key: "output_ids" + value: "_TOKENS_BATCH" + } + }, + { + model_name: "postprocessing" + model_version: -1 + input_map { + key: "TOKENS_BATCH" + value: "_TOKENS_BATCH" + } + output_map { + key: "OUTPUT" + value: "text_output" + } + } + ] +} diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py new file mode 100644 index 00000000..85209b43 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py @@ -0,0 +1,180 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import os +from collections import OrderedDict + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + tokenizer_dir = os.environ["triton_tokenizer_repository"] + tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] + + if tokenizer_type == "t5": + self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") + elif tokenizer_type == "auto": + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, padding_side="left" + ) + elif tokenizer_type == "llama": + self.tokenizer = LlamaTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left" + ) + else: + raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") + self.tokenizer.pad_token = self.tokenizer.eos_token + + # Parse model output configs + output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") + # Convert Triton types to numpy types + self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) + + self.state_dict = OrderedDict() + # TODO(pankaj) This should come from the batch size + self.cache_size = 2048 + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get request ID + request_id = request.request_id() + + # Get input tensors + tokens_batch = ( + pb_utils.get_input_tensor_by_name(request, "TOKENS_BATCH") + .as_numpy() + .flatten() + ) + if len(tokens_batch) == 0: + continue + + # Postprocess output data + prev_token = self._get_prev_token(request_id) + self._store_prev_token(request_id, tokens_batch[-1]) + if prev_token is None: + delta = self.tokenizer.decode(tokens_batch) + else: + # TODO(pankaj) Figure out how to make tokenizer.decode not + # ignore initial whitespace so we can avoid this hack. + # Get string with and without previous token and diff. This hack + # is needed because tokenizer.decode strips initial whitespace. 
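                # Illustrative example (hypothetical values): if decoding [prev_token]
                # alone yields "Hello" and decoding [prev_token, *tokens_batch] yields
                # "Hello world", the delta streamed back for this chunk is " world".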
+ old_string = self.tokenizer.decode([prev_token]) + with_prev_token = np.concatenate(([prev_token], tokens_batch)) + new_string = self.tokenizer.decode(with_prev_token) + delta = self._compute_delta(old_string, new_string) + + # Create output tensor + output_tensor = pb_utils.Tensor( + "OUTPUT", np.array([delta]).astype(self.output_dtype) + ) + inference_response = pb_utils.InferenceResponse( + output_tensors=[output_tensor] + ) + responses.append(inference_response) + + return responses + + def finalize(self): + print("Cleaning up...") + + def _store_prev_token(self, request_id, token): + if request_id in self.state_dict: + self.state_dict[request_id]["prev_token"] = token + + # Move request ID to end of queue to prevent it from being evicted + self.state_dict.move_to_end(request_id) + else: + # Evict least recently used item if cache is full + if len(self.state_dict) > self.cache_size: + self.state_dict.popitem(last=False) + + self.state_dict[request_id] = {"prev_token": token} + + def _get_prev_token(self, request_id): + if request_id in self.state_dict: + return self.state_dict[request_id]["prev_token"] + return None + + def _compute_delta(self, prev_str, new_str): + delta = "".join( + [ + char + for index, char in enumerate(new_str) + if index >= len(prev_str) or char != prev_str[index] + ] + ) + return delta + + def _postprocessing(self, tokens): + decoded_tokens = self.tokenizer.decode(tokens) + return decoded_tokens diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt new file mode 100644 index 00000000..0abeaac8 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt @@ -0,0 +1,64 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
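# Note: the Python backend in 1/model.py reads the tokenizer location from the
# triton_tokenizer_repository environment variable set when the server is launched,
# and uses the tokenizer_type parameter below to pick the tokenizer class.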
+ +name: "postprocessing" +backend: "python" +max_batch_size: 128 +input [ + { + name: "TOKENS_BATCH" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + } +] +output [ + { + name: "OUTPUT" + data_type: TYPE_STRING + dims: [ -1, -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "NousResearch/Llama-2-7b-hf" + } +} + +parameters { + key: "tokenizer_type" + value: { + string_value: "auto" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py new file mode 100644 index 00000000..db7b51d6 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py @@ -0,0 +1,259 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import csv +import json +import os +from typing import List + +import numpy as np +import triton_python_backend_utils as pb_utils +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + Parameters + ---------- + args : dict + Both keys and values are strings. 
The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + # Parse model configs + model_config = json.loads(args["model_config"]) + tokenizer_dir = os.environ["triton_tokenizer_repository"] + tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] + self.add_special_tokens = model_config["parameters"].get( + "add_special_tokens", {"string_value": "false"} + )["string_value"].lower() in ["true", "1", "t", "y", "yes"] + + if tokenizer_type == "t5": + self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") + elif tokenizer_type == "auto": + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_dir, padding_side="left" + ) + elif tokenizer_type == "llama": + self.tokenizer = LlamaTokenizer.from_pretrained( + tokenizer_dir, legacy=False, padding_side="left" + ) + else: + raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.pad_id = self.tokenizer.encode( + self.tokenizer.pad_token, add_special_tokens=False + )[0] + + # Parse model output configs and convert Triton types to numpy types + input_names = [ + "INPUT_ID", + "REQUEST_INPUT_LEN", + "BAD_WORDS_IDS", + "STOP_WORDS_IDS", + ] + for input_name in input_names: + setattr( + self, + input_name.lower() + "_dtype", + pb_utils.triton_string_to_numpy( + pb_utils.get_output_config_by_name(model_config, input_name)[ + "data_type" + ] + ), + ) + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. Depending on the batching configuration (e.g. Dynamic + Batching) used, `requests` may contain multiple requests. Every + Python model, must create one pb_utils.InferenceResponse for every + pb_utils.InferenceRequest in `requests`. If there is an error, you can + set the error argument when creating a pb_utils.InferenceResponse. + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + + responses = [] + + # Every Python backend must iterate over everyone of the requests + # and create a pb_utils.InferenceResponse for each of them. + for idx, request in enumerate(requests): + # Get input tensors + query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() + request_output_len = pb_utils.get_input_tensor_by_name( + request, "REQUEST_OUTPUT_LEN" + ).as_numpy() + + bad_words_dict = pb_utils.get_input_tensor_by_name( + request, "BAD_WORDS_DICT" + ).as_numpy() + stop_words_dict = pb_utils.get_input_tensor_by_name( + request, "STOP_WORDS_DICT" + ).as_numpy() + + # Preprocessing input data. + input_id, request_input_len = self._create_request(query) + bad_words = self._to_word_list_format(bad_words_dict) + stop_words = self._to_word_list_format(stop_words_dict) + + # Create output tensors. You need pb_utils.Tensor + # objects to create pb_utils.InferenceResponse. 
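            # INPUT_ID is the [batch, max_len] token matrix right-padded with pad_id,
            # REQUEST_INPUT_LEN holds each row's unpadded length, and the word-list
            # tensors use the [batch, 2, num_ids] layout built by _to_word_list_format.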
+ input_id_tensor = pb_utils.Tensor( + "INPUT_ID", np.array(input_id).astype(self.input_id_dtype) + ) + request_input_len_tensor = pb_utils.Tensor( + "REQUEST_INPUT_LEN", + np.array(request_input_len).astype(self.request_input_len_dtype), + ) + request_output_len_tensor = pb_utils.Tensor( + "REQUEST_OUTPUT_LEN", request_output_len + ) + bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) + stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) + + # Create InferenceResponse. You can set an error here in case + # there was a problem with handling this inference request. + # Below is an example of how you can set errors in inference + # response: + # + # pb_utils.InferenceResponse( + # output_tensors=..., TritonError("An error occurred")) + inference_response = pb_utils.InferenceResponse( + output_tensors=[ + input_id_tensor, + bad_words_ids_tensor, + stop_words_ids_tensor, + request_input_len_tensor, + request_output_len_tensor, + ] + ) + responses.append(inference_response) + + # You should return a list of pb_utils.InferenceResponse. Length + # of this list must match the length of `requests` list. + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print("Cleaning up...") + + def _create_request(self, query): + """ + query : batch string (2D numpy array) + """ + start_ids = [ + np.array( + self.tokenizer.encode( + s[0].decode(), add_special_tokens=self.add_special_tokens + ) + ).astype(int) + for s in query + ] + start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) + + max_len = 0 + for seq in start_ids: + max_len = max(max_len, seq.shape[0]) + start_ids = np.stack( + [ + np.pad( + seq, + (0, max_len - seq.shape[0]), + "constant", + constant_values=(0, self.pad_id), + ) + for seq in start_ids + ] + ) + + return start_ids, start_lengths + + def _to_word_list_format(self, word_dict: List[List[str]]): + """ + format of word_dict + len(word_dict) should be same to batch_size + word_dict[i] means the words for batch i + len(word_dict[i]) must be 1, which means it only contains 1 string + This string can contains several sentences and split by ",". + For example, if word_dict[2] = " I am happy, I am sad", then this function will return + the ids for two short sentences " I am happy" and " I am sad". 
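        The returned array has shape [batch_size, 2, num_ids]: row 0 holds the
        concatenated token ids for every word and row 1 their cumulative end
        offsets (padded with 0 and -1 respectively).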
+ """ + assert self.tokenizer != None, "need to set tokenizer" + + flat_ids = [] + offsets = [] + for word_dict_item in word_dict: + item_flat_ids = [] + item_offsets = [] + + if isinstance(word_dict_item[0], bytes): + word_dict_item = [word_dict_item[0].decode()] + + words = list(csv.reader(word_dict_item))[0] + for word in words: + ids = self.tokenizer.encode(word) + + if len(ids) == 0: + continue + + item_flat_ids += ids + item_offsets.append(len(ids)) + + flat_ids.append(np.array(item_flat_ids)) + offsets.append(np.cumsum(np.array(item_offsets))) + + pad_to = max(1, max(len(ids) for ids in flat_ids)) + + for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): + flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) + offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) + + return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt new file mode 100644 index 00000000..f9b150dd --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt @@ -0,0 +1,99 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
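# The preprocessing model tokenizes QUERY into the INPUT_ID / REQUEST_INPUT_LEN
# tensors and converts the bad-word / stop-word strings into the 2-row id format
# expected by the tensorrt_llm model; see 1/model.py for the implementation.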
+ +name: "preprocessing" +backend: "python" +max_batch_size: 128 +input [ + { + name: "QUERY" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "BAD_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "STOP_WORDS_DICT" + data_type: TYPE_STRING + dims: [ -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] +output [ + { + name: "INPUT_ID" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "REQUEST_INPUT_LEN" + data_type: TYPE_INT32 + dims: [ 1 ] + }, + { + name: "BAD_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "STOP_WORDS_IDS" + data_type: TYPE_INT32 + dims: [ 2, -1 ] + }, + { + name: "REQUEST_OUTPUT_LEN" + data_type: TYPE_UINT32 + dims: [ -1 ] + } +] + +parameters { + key: "tokenizer_dir" + value: { + string_value: "NousResearch/Llama-2-7b-hf" + } +} + +parameters { + key: "tokenizer_type" + value: { + string_value: "auto" + } +} + +instance_group [ + { + count: 1 + kind: KIND_CPU + } +] diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt new file mode 100644 index 00000000..c12796c1 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -0,0 +1,208 @@ +# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
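# The decoupled transaction policy below is what enables token-by-token streaming,
# and gpt_model_path points at the version directory that packages/utils.py
# populates with the downloaded engine files at load time.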
+ +name: "tensorrt_llm" +backend: "tensorrtllm" +max_batch_size: 2048 + +model_transaction_policy { + decoupled: True +} + +input [ + { + name: "input_ids" + data_type: TYPE_INT32 + dims: [ -1 ] + }, + { + name: "input_lengths" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + }, + { + name: "request_output_len" + data_type: TYPE_UINT32 + dims: [ 1 ] + }, + { + name: "end_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "pad_id" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "beam_width" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "temperature" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_k" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "len_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "repetition_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "min_length" + data_type: TYPE_UINT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "presence_penalty" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "random_seed" + data_type: TYPE_UINT64 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "stop" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + }, + { + name: "streaming" + data_type: TYPE_BOOL + dims: [ 1 ] + optional: true + } +] +output [ + { + name: "output_ids" + data_type: TYPE_INT32 + dims: [ -1, -1 ] + } +] +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] +parameters: { + key: "max_beam_width" + value: { + string_value: "1" + } +} +parameters: { + key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: { + string_value: "no" + } +} +parameters: { + key: "gpt_model_type" + value: { + string_value: "inflight_fused_batching" + } +} +parameters: { + key: "gpt_model_path" + value: { + string_value: "/packages/inflight_batcher_llm/tensorrt_llm/1/" + } +} +parameters: { + key: "max_tokens_in_paged_kv_cache" + value: { + string_value: "100000" + } +} +parameters: { + key: "batch_scheduler_policy" + value: { + string_value: "max_utilization" + } +} +parameters: { + key: "kv_cache_free_gpu_mem_fraction" + value: { + string_value: "0.9" + } +} +parameters: { + key: "max_num_sequences" + value: { + string_value: "2048" + } +} +parameters: { + key: "enable_trt_overlap" + value: { + string_value: "False" + } +} diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/utils.py b/llama/llama-2-7b-trt-llm-2tp/packages/utils.py new file mode 100644 index 00000000..52afe988 --- /dev/null +++ b/llama/llama-2-7b-trt-llm-2tp/packages/utils.py @@ -0,0 +1,73 @@ +import socket +from pathlib import Path + +import tritonclient.grpc as grpcclient +from huggingface_hub import snapshot_download +from tritonclient.utils import np_to_triton_dtype + +GRPC_SERVICE_PORT = 8001 +HTTP_SERVICE_PORT = 8003 + + +def move_all_files(src: Path, dest: Path): + """ + Moves all files from `src` to `dest` recursively. 
+ """ + for item in src.iterdir(): + dest_item = dest / item.name + if item.is_dir(): + dest_item.mkdir(parents=True, exist_ok=True) + move_all_files(item, dest_item) + else: + item.rename(dest_item) + + +def prepare_model_repository(data_dir: Path): + """ + Moves all files from `data_dir` to the model repository directory. + """ + # Ensure the destination directory exists + dest_dir = Path("/packages/inflight_batcher_llm/tensorrt_llm/1") + dest_dir.mkdir(parents=True, exist_ok=True) + + # Ensure empty version directory for `ensemble` model exists + ensemble_dir = Path("/packages/inflight_batcher_llm/ensemble/1") + ensemble_dir.mkdir(parents=True, exist_ok=True) + + # Move all files and directories from data_dir to dest_dir + move_all_files(data_dir, dest_dir) + + +def prepare_grpc_tensor(name, input): + t = grpcclient.InferInput(name, input.shape, np_to_triton_dtype(input.dtype)) + t.set_data_from_numpy(input) + return t + + +def download_engine(engine_repository: str, fp: Path, auth_token=None): + """ + Downloads the specified engine from Hugging Face Hub. + """ + snapshot_download( + engine_repository, + local_dir=fp, + local_dir_use_symlinks=False, + max_workers=4, + **({"use_auth_token": auth_token} if auth_token is not None else {}), + ) + + +def server_loaded(): + def port_is_available(port): + available = False + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + try: + sock.bind(("0.0.0.0", port)) + available = True + except: + pass + return available + + return not port_is_available(GRPC_SERVICE_PORT) or not port_is_available( + HTTP_SERVICE_PORT + ) From f40137a1dea1c96254306d04e1ae11263e9f480f Mon Sep 17 00:00:00 2001 From: Vlad Shulman Date: Thu, 4 Apr 2024 18:38:11 -0700 Subject: [PATCH 2/3] bumping mistral version --- mistral/mistral-7b-chat/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mistral/mistral-7b-chat/config.yaml b/mistral/mistral-7b-chat/config.yaml index 6acd401c..ef3d6a59 100644 --- a/mistral/mistral-7b-chat/config.yaml +++ b/mistral/mistral-7b-chat/config.yaml @@ -6,7 +6,7 @@ model_cache: - '*.json' - '*.safetensors' - '*.model' - repo_id: mistralai/Mistral-7B-Instruct-v0.1 + repo_id: mistralai/Mistral-7B-Instruct-v0.2 model_metadata: avatar_url: https://cdn.baseten.co/production/static/explore/mistral_logo.png cover_image_url: https://cdn.baseten.co/production/static/explore/mistral.png @@ -14,7 +14,7 @@ model_metadata: messages: - content: What is the mistral wind? 
role: user - model: mistralai/Mistral-7B-Instruct-v0.1 + model: mistralai/Mistral-7B-Instruct-v0.2 pretty_name: Mistral 7B Chat tags: - text-generation @@ -24,7 +24,7 @@ python_version: py311 requirements: - sentencepiece - accelerate -- transformers==4.34.0 +- transformers==4.38.1 - torch==2.0.1 - hf_transfer==0.1.4 resources: From 6f12368a76c0c47a2b216d43b04d23e992638025 Mon Sep 17 00:00:00 2001 From: Vlad Shulman Date: Thu, 4 Apr 2024 18:56:52 -0700 Subject: [PATCH 3/3] undoing llama change --- llama/llama-2-7b-trt-llm-2tp/README.md | 68 ----- .../llama-2-7b-trt-llm-2tp/TRT-LLM-README.md | 91 ------ llama/llama-2-7b-trt-llm-2tp/config.yaml | 38 --- .../data/.gitattributes | 37 --- .../llama-2-7b-trt-llm-2tp/model/__init__.py | 0 llama/llama-2-7b-trt-llm-2tp/model/model.py | 139 ---------- .../llama-2-7b-trt-llm-2tp/packages/client.py | 155 ----------- .../ensemble/config.pbtxt | 246 ----------------- .../postprocessing/1/model.py | 180 ------------ .../postprocessing/config.pbtxt | 64 ----- .../preprocessing/1/model.py | 259 ------------------ .../preprocessing/config.pbtxt | 99 ------- .../tensorrt_llm/config.pbtxt | 208 -------------- .../llama-2-7b-trt-llm-2tp/packages/utils.py | 73 ----- 14 files changed, 1657 deletions(-) delete mode 100644 llama/llama-2-7b-trt-llm-2tp/README.md delete mode 100644 llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md delete mode 100644 llama/llama-2-7b-trt-llm-2tp/config.yaml delete mode 100644 llama/llama-2-7b-trt-llm-2tp/data/.gitattributes delete mode 100644 llama/llama-2-7b-trt-llm-2tp/model/__init__.py delete mode 100644 llama/llama-2-7b-trt-llm-2tp/model/model.py delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/client.py delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt delete mode 100644 llama/llama-2-7b-trt-llm-2tp/packages/utils.py diff --git a/llama/llama-2-7b-trt-llm-2tp/README.md b/llama/llama-2-7b-trt-llm-2tp/README.md deleted file mode 100644 index 97cdc88a..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# LLaMA2-7B-Chat Truss - -This is a [Truss](https://truss.baseten.co/) for an fp8 TRT 2TP version of LLaMA2-7B-Chat. Llama is a family of language models released by Meta. This README will walk you through how to deploy this Truss on Baseten to get your own instance of LLaMA2-7B-Chat. - -**Warning: This example is only intended for usage on a single A100, changing your resource type for this deployment will result in unsupported behavior** - -## Truss - -Truss is an open-source model serving framework developed by Baseten. It allows you to develop and deploy machine learning models onto Baseten. Using Truss, you can develop a GPU model using [live-reload](https://baseten.co/blog/technical-deep-dive-truss-live-reload), package models and their associated code, create Docker containers and deploy on Baseten. 
- -## Deploying LLaMA2-7B-Chat - -First, clone this repository: - -```sh -git clone https://github.com/basetenlabs/truss-examples/ -cd llama/llama-2-7b-trt-llm -``` - -Before deployment: - -1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys). -2. Install the latest version of Truss: `pip install --upgrade truss` - -With `llama-2-7b-trt-llm` as your working directory, you can deploy the model with: - -```sh -truss push --publish -``` - -Paste your Baseten API key if prompted. - -For more information, see [Truss documentation](https://truss.baseten.co). - -## LLaMA2-7B API documentation - -This section provides an overview of the LLaMA2-7B API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided instruction. - -### API route: `predict` - -We expect requests will the following information: - -- `prompt` (str): The prompt you'd like to complete -- `max_tokens` (int, default: 50): The max token count. This includes the number of tokens in your prompt so if this value is less than your prompt, you'll just recieve a truncated version of the prompt. -- `beam_width` (int, default:50): The number of beams to compute. This must be 1 for this version of TRT-LLM. Inflight-batching does not support beams > 1. -- `bad_words_list` (list, default:[]): A list of words to not include in generated output. -- `stop_words_list` (list, default:[]): A list of words to stop generation upon encountering. -- `repetition_penalty` (float, defualt: 1.0): A repetition penalty to incentivize not repeating tokens. - -This Truss will stream responses back. Responses will be buffered chunks of text. - -## Example usage - -```sh -truss predict -d '{"prompt": "What is the meaning of life?"}' -``` - -You can also invoke your model via a REST API - -```sh -curl -X POST " https://app.baseten.co/models/YOUR_MODEL_ID/predict" \ - -H "Content-Type: application/json" \ - -H 'Authorization: Api-Key {YOUR_API_KEY}' \ - -d '{ - "prompt": "What's the meaning of life?", - }' - -``` diff --git a/llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md b/llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md deleted file mode 100644 index 981f2050..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/TRT-LLM-README.md +++ /dev/null @@ -1,91 +0,0 @@ - - -# TRTLLM - -### Overview -This Truss adds support for TRT-LLM engines via Triton Inference Server. TRT-LLM is a highly-performant language model runtime. We leverage the C++ runtime to take advantage of in-flight batching (aka continous batching). - -### Prerequisites - -To use this Truss, your engine must be built with in-flight batching support. Refer to your architecture-specific `build.py` re: how to build with in-flight-batching support. - -### Config - -This Truss is primarily config driven. This means that most settings you'll need to edit are located in the `config.yaml`. These settings are all located underneath the `model_metadata` key. - -- `tensor_parallelism` (int): If you built your model with tensor parallelism support, you'll need to set this value with the same value used during the build engine step. This value should be the same as the number of GPUs in the `resources` section. - -*Pipeline parallelism is not supported in this version but will be added later. 
As noted from Nvidia, pipeline parallelism reduces the need for high-bandwidth communication but may incur load-balancing issues and may be less efficient in terms of GPU utilization.* - -- `engine_repository` (str): We expect engines to be uploaded to Huggingface with a flat directory structure (i.e the engine and associated files are not underneath a folder structure). This value is the full `{org_name}/{repo_name}` string. Engines can be private or public. - -- `tokenizer_repository` (str): Engines do not come bundled with their own tokenizer. This is the Huggingface repository where we can find a tokenizer. Tokenizers can be private or public. - -If the engine and repository tokenizers are private, you'll need to update the `secrets` section of the `config.yaml` as follows: - -``` -secrets: - hf_access_token: "my_hf_api_key" -``` - -### Performance - -TRT-LLM engines are designed to be highly performant. Once your Truss has been deployed, you may find that you're not fully utilizing the GPU. The following are levers to improve performance but require trial-and-error to identify appropriates. All of these values live inside the `config.pbtxt` for a given ensemble model. - -#### Preprocessing / Postprocessing - -``` -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] -``` -By default, we load 1 instance of the pre/post models. If you find that the tokenizer is a bottleneck, increasing the `count` variable here will load more replicas of these models and Triton will automatically load balance across model instances. - -### Tensorrt LLM -``` -parameters: { - key: "max_tokens_in_paged_kv_cache" - value: { - string_value: "10000" - } -} -``` -By default, we set the `max_tokens_in_paged_kv_cache` to 10000. For a 7B model on 1 A100 with a batch size of 8, we have over 60GB of GPU memory left over. We can increase this value to 100k comfortably and allow for more tokens in the KV cache. Your mileage will vary based on the size of your model and the hardware you're running on. - -``` -parameters: { - key: "kv_cache_free_gpu_mem_fraction" - value: { - string_value: "0.1" - } -} -``` -By default, if `max_tokens_in_paged_kv_cache` is unset, Triton Inference Server will attempt to preallocate `kv_cache_free_gpu_mem_fraction` fraction of free gpu memory for the KV cache. - -``` -parameters: { - key: "max_num_sequences" - value: { - string_value: "64" - } -} -``` -The `max_num_sequences` param is the maximum numbers of requests that the inference server can maintain state for at a given time (state = KV cache + decoder state). -See this [comment](https://github.com/NVIDIA/TensorRT-LLM/issues/65#issuecomment-1774332446) for more details. Setting this value higher allows for more parallel processing but uses more GPU memory. - -### API - -We expect requests will the following information: - - -- ```prompt``` (str): The prompt you'd like to complete -- ```max_tokens``` (int, default: 50): The max token count. This includes the number of tokens in your prompt so if this value is less than your prompt, you'll just recieve a truncated version of the prompt. -- ```beam_width``` (int, default:50): The number of beams to compute. This must be 1 for this version of TRT-LLM. Inflight-batching does not support beams > 1. -- ```bad_words_list``` (list, default:[]): A list of words to not include in generated output. -- ```stop_words_list``` (list, default:[]): A list of words to stop generation upon encountering. 
-- ```repetition_penalty``` (float, defualt: 1.0): A repetition penalty to incentivize not repeating tokens. - -This Truss will stream responses back. Responses will be buffered chunks of text. diff --git a/llama/llama-2-7b-trt-llm-2tp/config.yaml b/llama/llama-2-7b-trt-llm-2tp/config.yaml deleted file mode 100644 index 06369d60..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/config.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apply_library_patches: true -base_image: - image: docker.io/baseten/trtllm-server:r23.12_baseten_v0.9.0.dev2024022000 - python_executable_path: /usr/bin/python3 -description: Generate text from a prompt with this seven billion parameter language - model. -build: - arguments: - engine_repository: strangervb/mistral_fp8_i100_o400_tp2_v0 - pipeline_parallel_count: 1 - tensor_parallel_count: 2 - tokenizer_repository: NousResearch/Llama-2-7b-chat-hf -environment_variables: {} -external_package_dirs: [] -model_metadata: - avatar_url: https://cdn.baseten.co/production/static/explore/meta.png - cover_image_url: https://cdn.baseten.co/production/static/explore/llama.png - engine_repository: strangervb/mistral_fp8_i100_o400_tp2_v0 - example_model_input: - max_tokens: 100 - prompt: What's the meaning of life? - repo_id: NousResearch/Llama-2-7b-chat-hf - tags: - - text-generation - tensor_parallelism: 2 - tokenizer_repository: NousResearch/Llama-2-7b-chat-hf -model_name: Llama 7B Chat TRT 2TP -python_version: py311 -requirements: -- tritonclient[all] -- transformers -- jinja2 -resources: - accelerator: H100:2 - use_gpu: true -runtime: - predict_concurrency: 4 -secrets: {} diff --git a/llama/llama-2-7b-trt-llm-2tp/data/.gitattributes b/llama/llama-2-7b-trt-llm-2tp/data/.gitattributes deleted file mode 100644 index 728629f3..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/data/.gitattributes +++ /dev/null @@ -1,37 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -gpt_float16_tp2_rank0.engine filter=lfs diff=lfs merge=lfs -text -gpt_float16_tp2_rank1.engine filter=lfs diff=lfs merge=lfs -text diff --git 
a/llama/llama-2-7b-trt-llm-2tp/model/__init__.py b/llama/llama-2-7b-trt-llm-2tp/model/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/llama/llama-2-7b-trt-llm-2tp/model/model.py b/llama/llama-2-7b-trt-llm-2tp/model/model.py deleted file mode 100644 index a1306df1..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/model/model.py +++ /dev/null @@ -1,139 +0,0 @@ -import os -from itertools import count -from pathlib import Path -from threading import Thread - -import numpy as np -from client import TritonClient, UserData -from transformers import AutoTokenizer -from utils import download_engine, prepare_grpc_tensor, server_loaded - -TRITON_MODEL_REPOSITORY_PATH = Path("/packages/inflight_batcher_llm/") - - -class Model: - def __init__(self, **kwargs): - self._data_dir = kwargs["data_dir"] - self._config = kwargs["config"] - self._secrets = kwargs["secrets"] - self._request_id_counter = count(start=1) - self.triton_client = None - self.tokenizer = None - self.uses_openai_api = ( - "openai-compatible" in self._config["model_metadata"]["tags"] - ) - - def load(self): - tensor_parallel_count = self._config["model_metadata"].get( - "tensor_parallelism", 1 - ) - pipeline_parallel_count = self._config["model_metadata"].get( - "pipeline_parallelism", 1 - ) - if "hf_access_token" in self._secrets._base_secrets.keys(): - hf_access_token = self._secrets["hf_access_token"] - else: - hf_access_token = None - is_external_engine_repo = "engine_repository" in self._config["model_metadata"] - - # Instantiate TritonClient - self.triton_client = TritonClient( - data_dir=self._data_dir, - model_repository_dir=TRITON_MODEL_REPOSITORY_PATH, - parallel_count=tensor_parallel_count * pipeline_parallel_count, - ) - - # Download model from Hugging Face Hub if specified - if is_external_engine_repo: - if not server_loaded(): - download_engine( - engine_repository=self._config["model_metadata"][ - "engine_repository" - ], - fp=self._data_dir, - auth_token=hf_access_token, - ) - - # Load Triton Server and model - tokenizer_repository = self._config["model_metadata"]["tokenizer_repository"] - env = {"triton_tokenizer_repository": tokenizer_repository} - if hf_access_token is not None: - env["HUGGING_FACE_HUB_TOKEN"] = hf_access_token - - self.triton_client.load_server_and_model(env=env) - - # setup eos token - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_repository, token=hf_access_token - ) - self.eos_token_id = self.tokenizer.eos_token_id - - def predict(self, model_input): - user_data = UserData() - model_name = "ensemble" - stream_uuid = str(os.getpid()) + str(next(self._request_id_counter)) - - if self.uses_openai_api: - prompt = self.tokenizer.apply_chat_template( - model_input.get("messages"), - tokenize=False, - ) - else: - prompt = model_input.get("prompt") - - max_tokens = model_input.get("max_tokens", 50) - beam_width = model_input.get("beam_width", 1) - bad_words_list = model_input.get("bad_words_list", [""]) - stop_words_list = model_input.get("stop_words_list", [""]) - repetition_penalty = model_input.get("repetition_penalty", 1.0) - ignore_eos = model_input.get("ignore_eos", False) - stream = model_input.get("stream", True) - - input0 = [[prompt]] - input0_data = np.array(input0).astype(object) - output0_len = np.ones_like(input0).astype(np.uint32) * max_tokens - bad_words_list = np.array([bad_words_list], dtype=object) - stop_words_list = np.array([stop_words_list], dtype=object) - stream_data = np.array([[stream]], dtype=bool) - beam_width_data = np.array([[beam_width]], 
dtype=np.uint32) - repetition_penalty_data = np.array([[repetition_penalty]], dtype=np.float32) - - inputs = [ - prepare_grpc_tensor("text_input", input0_data), - prepare_grpc_tensor("max_tokens", output0_len), - prepare_grpc_tensor("bad_words", bad_words_list), - prepare_grpc_tensor("stop_words", stop_words_list), - prepare_grpc_tensor("stream", stream_data), - prepare_grpc_tensor("beam_width", beam_width_data), - prepare_grpc_tensor("repetition_penalty", repetition_penalty_data), - ] - - if not ignore_eos: - end_id_data = np.array([[self.eos_token_id]], dtype=np.uint32) - inputs.append(prepare_grpc_tensor("end_id", end_id_data)) - else: - # do nothing, trt-llm by default doesn't stop on `eos` - pass - - # Start GRPC stream in a separate thread - stream_thread = Thread( - target=self.triton_client.start_grpc_stream, - args=(user_data, model_name, inputs, stream_uuid), - ) - stream_thread.start() - - def generate(): - # Yield results from the queue - for i in TritonClient.stream_predict(user_data): - yield i - - # Clean up GRPC stream and thread - self.triton_client.stop_grpc_stream(stream_uuid, stream_thread) - - if stream: - return generate() - else: - if self.uses_openai_api: - return "".join(generate()) - else: - return {"text": "".join(generate())} diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/client.py b/llama/llama-2-7b-trt-llm-2tp/packages/client.py deleted file mode 100644 index a996b074..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/client.py +++ /dev/null @@ -1,155 +0,0 @@ -import json -import os -import subprocess -import time -from functools import partial -from pathlib import Path -from queue import Queue -from threading import Thread - -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from tritonclient.utils import InferenceServerException -from utils import ( - GRPC_SERVICE_PORT, - HTTP_SERVICE_PORT, - prepare_model_repository, - server_loaded, -) - - -class UserData: - def __init__(self): - self._completed_requests = Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - - -class TritonClient: - def __init__(self, data_dir: Path, model_repository_dir: Path, parallel_count=1): - self._data_dir = data_dir - self._model_repository_dir = model_repository_dir - self._parallel_count = parallel_count - self._http_client = None - self._grpc_client_map = {} - - def start_grpc_stream(self, user_data, model_name, inputs, stream_uuid): - """Starts a GRPC stream and sends a request to the Triton server.""" - grpc_client_instance = grpcclient.InferenceServerClient( - url=f"localhost:{GRPC_SERVICE_PORT}", verbose=False - ) - self._grpc_client_map[stream_uuid] = grpc_client_instance - grpc_client_instance.start_stream(callback=partial(callback, user_data)) - grpc_client_instance.async_stream_infer( - model_name, - inputs, - request_id=stream_uuid, - enable_empty_final_response=True, - ) - - def stop_grpc_stream(self, stream_uuid, stream_thread: Thread): - """Closes a GRPC stream and stops the associated thread.""" - triton_grpc_stream = self._grpc_client_map[stream_uuid] - triton_grpc_stream.stop_stream() - stream_thread.join() - del self._grpc_client_map[stream_uuid] - - def start_server( - self, - mpi: int = 1, - env: dict = {}, - ): - """Triton Inference Server has different startup commands depending on - whether it is running in a TP=1 or TP>1 configuration. 
This function - starts the server with the appropriate command.""" - if mpi == 1: - command = [ - "tritonserver", - "--model-repository", - str(self._model_repository_dir), - "--grpc-port", - f"{GRPC_SERVICE_PORT}", - "--http-port", - f"{HTTP_SERVICE_PORT}", - ] - command = [ - "mpirun", - "--allow-run-as-root", - ] - for i in range(mpi): - command += [ - "-n", - "1", - "tritonserver", - "--model-repository", - str(self._model_repository_dir), - "--grpc-port", - f"{GRPC_SERVICE_PORT}", - "--http-port", - f"{HTTP_SERVICE_PORT}", - "--disable-auto-complete-config", - f"--backend-config=python,shm-region-prefix-name=prefix{str(i)}_", - ":", - ] - return subprocess.Popen( - command, - env={**os.environ, **env}, - ) - - def load_server_and_model(self, env: dict): - """Loads the Triton server and the model.""" - if not server_loaded(): - prepare_model_repository(self._data_dir) - self.start_server(mpi=self._parallel_count, env=env) - - self._http_client = httpclient.InferenceServerClient( - url=f"localhost:{HTTP_SERVICE_PORT}", verbose=False - ) - is_server_up = False - while not is_server_up: - try: - is_server_up = self._http_client.is_server_live() - except ConnectionRefusedError: - time.sleep(2) - continue - - while self._http_client.is_model_ready(model_name="ensemble") == False: - time.sleep(2) - continue - - @staticmethod - def stream_predict(user_data: UserData): - """Static method to yield predictions or errors based on input and a streaming user_data queue.""" - - def _is_final_response(result): - """Check if the given result is a final response according to Triton's specification.""" - if isinstance(result, InferenceServerException): - return True - - if result: - final_response_param = result.get_response().parameters.get( - "triton_final_response" - ) - return ( - final_response_param.bool_param if final_response_param else False - ) - return False - - result = None - - while not _is_final_response(result): - try: - result = user_data._completed_requests.get() - if not isinstance(result, InferenceServerException): - res = result.as_numpy("text_output") - yield res[0].decode("utf-8") - else: - yield json.dumps({"status": "error", "message": result.message()}) - except Exception as e: - yield json.dumps({"status": "error", "message": str(e)}) - break diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt deleted file mode 100644 index 01131a7e..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/ensemble/config.pbtxt +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -name: "ensemble" -platform: "ensemble" -max_batch_size: 128 -input [ - { - name: "text_input" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "max_tokens" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "bad_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "stop_words" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "pad_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "length_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "min_length" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - optional: true - }, - { - name: "stream" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "text_output" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] -ensemble_scheduling { - step [ - { - model_name: "preprocessing" - model_version: -1 - input_map { - key: "QUERY" - value: "text_input" - } - input_map { - key: "REQUEST_OUTPUT_LEN" - value: "max_tokens" - } - input_map { - key: "BAD_WORDS_DICT" - value: "bad_words" - } - input_map { - key: "STOP_WORDS_DICT" - value: "stop_words" - } - output_map { - key: "REQUEST_INPUT_LEN" - value: "_REQUEST_INPUT_LEN" - } - output_map { - key: "INPUT_ID" - value: "_INPUT_ID" - } - output_map { - key: "REQUEST_OUTPUT_LEN" - value: "_REQUEST_OUTPUT_LEN" - } - }, - { - model_name: "tensorrt_llm" - model_version: -1 - input_map { - key: "input_ids" - value: "_INPUT_ID" - } - input_map { - key: "input_lengths" - value: "_REQUEST_INPUT_LEN" - } - input_map { - key: "request_output_len" - value: "_REQUEST_OUTPUT_LEN" - } - input_map { - key: "end_id" - value: "end_id" - } - input_map { - key: "pad_id" - value: "pad_id" - } - input_map { - key: "runtime_top_k" - value: "top_k" - } - input_map { - key: "runtime_top_p" - value: "top_p" - } - input_map { - key: "temperature" - value: "temperature" - } - input_map { - key: "len_penalty" - value: "length_penalty" - } - input_map { - key: "repetition_penalty" - value: "repetition_penalty" - } - input_map { - key: "min_length" - value: "min_length" - } - input_map { - key: 
"presence_penalty" - value: "presence_penalty" - } - input_map { - key: "random_seed" - value: "random_seed" - } - input_map { - key: "beam_width" - value: "beam_width" - } - input_map { - key: "streaming" - value: "stream" - } - output_map { - key: "output_ids" - value: "_TOKENS_BATCH" - } - }, - { - model_name: "postprocessing" - model_version: -1 - input_map { - key: "TOKENS_BATCH" - value: "_TOKENS_BATCH" - } - output_map { - key: "OUTPUT" - value: "text_output" - } - } - ] -} diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py deleted file mode 100644 index 85209b43..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/1/model.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import os -from collections import OrderedDict - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args["model_config"]) - tokenizer_dir = os.environ["triton_tokenizer_repository"] - tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] - - if tokenizer_type == "t5": - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") - elif tokenizer_type == "auto": - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_dir, padding_side="left" - ) - elif tokenizer_type == "llama": - self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side="left" - ) - else: - raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") - self.tokenizer.pad_token = self.tokenizer.eos_token - - # Parse model output configs - output_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT") - # Convert Triton types to numpy types - self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"]) - - self.state_dict = OrderedDict() - # TODO(pankaj) This should come from the batch size - self.cache_size = 2048 - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get request ID - request_id = request.request_id() - - # Get input tensors - tokens_batch = ( - pb_utils.get_input_tensor_by_name(request, "TOKENS_BATCH") - .as_numpy() - .flatten() - ) - if len(tokens_batch) == 0: - continue - - # Postprocess output data - prev_token = self._get_prev_token(request_id) - self._store_prev_token(request_id, tokens_batch[-1]) - if prev_token is None: - delta = self.tokenizer.decode(tokens_batch) - else: - # TODO(pankaj) Figure out how to make tokenizer.decode not - # ignore initial whitespace so we can avoid this hack. - # Get string with and without previous token and diff. This hack - # is needed because tokenizer.decode strips initial whitespace. 
- old_string = self.tokenizer.decode([prev_token]) - with_prev_token = np.concatenate(([prev_token], tokens_batch)) - new_string = self.tokenizer.decode(with_prev_token) - delta = self._compute_delta(old_string, new_string) - - # Create output tensor - output_tensor = pb_utils.Tensor( - "OUTPUT", np.array([delta]).astype(self.output_dtype) - ) - inference_response = pb_utils.InferenceResponse( - output_tensors=[output_tensor] - ) - responses.append(inference_response) - - return responses - - def finalize(self): - print("Cleaning up...") - - def _store_prev_token(self, request_id, token): - if request_id in self.state_dict: - self.state_dict[request_id]["prev_token"] = token - - # Move request ID to end of queue to prevent it from being evicted - self.state_dict.move_to_end(request_id) - else: - # Evict least recently used item if cache is full - if len(self.state_dict) > self.cache_size: - self.state_dict.popitem(last=False) - - self.state_dict[request_id] = {"prev_token": token} - - def _get_prev_token(self, request_id): - if request_id in self.state_dict: - return self.state_dict[request_id]["prev_token"] - return None - - def _compute_delta(self, prev_str, new_str): - delta = "".join( - [ - char - for index, char in enumerate(new_str) - if index >= len(prev_str) or char != prev_str[index] - ] - ) - return delta - - def _postprocessing(self, tokens): - decoded_tokens = self.tokenizer.decode(tokens) - return decoded_tokens diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt deleted file mode 100644 index 0abeaac8..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/postprocessing/config.pbtxt +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "postprocessing" -backend: "python" -max_batch_size: 128 -input [ - { - name: "TOKENS_BATCH" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -output [ - { - name: "OUTPUT" - data_type: TYPE_STRING - dims: [ -1, -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "NousResearch/Llama-2-7b-hf" - } -} - -parameters { - key: "tokenizer_type" - value: { - string_value: "auto" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py deleted file mode 100644 index db7b51d6..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/1/model.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import csv -import json -import os -from typing import List - -import numpy as np -import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer - - -class TritonPythonModel: - """Your Python model must use the same class name. Every Python model - that is created must have "TritonPythonModel" as the class name. - """ - - def initialize(self, args): - """`initialize` is called only once when the model is being loaded. - Implementing `initialize` function is optional. This function allows - the model to initialize any state associated with this model. - Parameters - ---------- - args : dict - Both keys and values are strings. 
The dictionary keys and values are: - * model_config: A JSON string containing the model configuration - * model_instance_kind: A string containing model instance kind - * model_instance_device_id: A string containing model instance device ID - * model_repository: Model repository path - * model_version: Model version - * model_name: Model name - """ - # Parse model configs - model_config = json.loads(args["model_config"]) - tokenizer_dir = os.environ["triton_tokenizer_repository"] - tokenizer_type = model_config["parameters"]["tokenizer_type"]["string_value"] - self.add_special_tokens = model_config["parameters"].get( - "add_special_tokens", {"string_value": "false"} - )["string_value"].lower() in ["true", "1", "t", "y", "yes"] - - if tokenizer_type == "t5": - self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir, padding_side="left") - elif tokenizer_type == "auto": - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_dir, padding_side="left" - ) - elif tokenizer_type == "llama": - self.tokenizer = LlamaTokenizer.from_pretrained( - tokenizer_dir, legacy=False, padding_side="left" - ) - else: - raise AttributeError(f"Unexpected tokenizer type: {tokenizer_type}") - self.tokenizer.pad_token = self.tokenizer.eos_token - - self.pad_id = self.tokenizer.encode( - self.tokenizer.pad_token, add_special_tokens=False - )[0] - - # Parse model output configs and convert Triton types to numpy types - input_names = [ - "INPUT_ID", - "REQUEST_INPUT_LEN", - "BAD_WORDS_IDS", - "STOP_WORDS_IDS", - ] - for input_name in input_names: - setattr( - self, - input_name.lower() + "_dtype", - pb_utils.triton_string_to_numpy( - pb_utils.get_output_config_by_name(model_config, input_name)[ - "data_type" - ] - ), - ) - - def execute(self, requests): - """`execute` must be implemented in every Python model. `execute` - function receives a list of pb_utils.InferenceRequest as the only - argument. This function is called when an inference is requested - for this model. Depending on the batching configuration (e.g. Dynamic - Batching) used, `requests` may contain multiple requests. Every - Python model, must create one pb_utils.InferenceResponse for every - pb_utils.InferenceRequest in `requests`. If there is an error, you can - set the error argument when creating a pb_utils.InferenceResponse. - Parameters - ---------- - requests : list - A list of pb_utils.InferenceRequest - Returns - ------- - list - A list of pb_utils.InferenceResponse. The length of this list must - be the same as `requests` - """ - - responses = [] - - # Every Python backend must iterate over everyone of the requests - # and create a pb_utils.InferenceResponse for each of them. - for idx, request in enumerate(requests): - # Get input tensors - query = pb_utils.get_input_tensor_by_name(request, "QUERY").as_numpy() - request_output_len = pb_utils.get_input_tensor_by_name( - request, "REQUEST_OUTPUT_LEN" - ).as_numpy() - - bad_words_dict = pb_utils.get_input_tensor_by_name( - request, "BAD_WORDS_DICT" - ).as_numpy() - stop_words_dict = pb_utils.get_input_tensor_by_name( - request, "STOP_WORDS_DICT" - ).as_numpy() - - # Preprocessing input data. - input_id, request_input_len = self._create_request(query) - bad_words = self._to_word_list_format(bad_words_dict) - stop_words = self._to_word_list_format(stop_words_dict) - - # Create output tensors. You need pb_utils.Tensor - # objects to create pb_utils.InferenceResponse. 
- input_id_tensor = pb_utils.Tensor( - "INPUT_ID", np.array(input_id).astype(self.input_id_dtype) - ) - request_input_len_tensor = pb_utils.Tensor( - "REQUEST_INPUT_LEN", - np.array(request_input_len).astype(self.request_input_len_dtype), - ) - request_output_len_tensor = pb_utils.Tensor( - "REQUEST_OUTPUT_LEN", request_output_len - ) - bad_words_ids_tensor = pb_utils.Tensor("BAD_WORDS_IDS", bad_words) - stop_words_ids_tensor = pb_utils.Tensor("STOP_WORDS_IDS", stop_words) - - # Create InferenceResponse. You can set an error here in case - # there was a problem with handling this inference request. - # Below is an example of how you can set errors in inference - # response: - # - # pb_utils.InferenceResponse( - # output_tensors=..., TritonError("An error occurred")) - inference_response = pb_utils.InferenceResponse( - output_tensors=[ - input_id_tensor, - bad_words_ids_tensor, - stop_words_ids_tensor, - request_input_len_tensor, - request_output_len_tensor, - ] - ) - responses.append(inference_response) - - # You should return a list of pb_utils.InferenceResponse. Length - # of this list must match the length of `requests` list. - return responses - - def finalize(self): - """`finalize` is called only once when the model is being unloaded. - Implementing `finalize` function is optional. This function allows - the model to perform any necessary clean ups before exit. - """ - print("Cleaning up...") - - def _create_request(self, query): - """ - query : batch string (2D numpy array) - """ - start_ids = [ - np.array( - self.tokenizer.encode( - s[0].decode(), add_special_tokens=self.add_special_tokens - ) - ).astype(int) - for s in query - ] - start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int) - - max_len = 0 - for seq in start_ids: - max_len = max(max_len, seq.shape[0]) - start_ids = np.stack( - [ - np.pad( - seq, - (0, max_len - seq.shape[0]), - "constant", - constant_values=(0, self.pad_id), - ) - for seq in start_ids - ] - ) - - return start_ids, start_lengths - - def _to_word_list_format(self, word_dict: List[List[str]]): - """ - format of word_dict - len(word_dict) should be same to batch_size - word_dict[i] means the words for batch i - len(word_dict[i]) must be 1, which means it only contains 1 string - This string can contains several sentences and split by ",". - For example, if word_dict[2] = " I am happy, I am sad", then this function will return - the ids for two short sentences " I am happy" and " I am sad". 
- """ - assert self.tokenizer != None, "need to set tokenizer" - - flat_ids = [] - offsets = [] - for word_dict_item in word_dict: - item_flat_ids = [] - item_offsets = [] - - if isinstance(word_dict_item[0], bytes): - word_dict_item = [word_dict_item[0].decode()] - - words = list(csv.reader(word_dict_item))[0] - for word in words: - ids = self.tokenizer.encode(word) - - if len(ids) == 0: - continue - - item_flat_ids += ids - item_offsets.append(len(ids)) - - flat_ids.append(np.array(item_flat_ids)) - offsets.append(np.cumsum(np.array(item_offsets))) - - pad_to = max(1, max(len(ids) for ids in flat_ids)) - - for i, (ids, offs) in enumerate(zip(flat_ids, offsets)): - flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0) - offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) - - return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2)) diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt deleted file mode 100644 index f9b150dd..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/preprocessing/config.pbtxt +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "preprocessing" -backend: "python" -max_batch_size: 128 -input [ - { - name: "QUERY" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "BAD_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "STOP_WORDS_DICT" - data_type: TYPE_STRING - dims: [ -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_UINT32 - dims: [ -1 ] - } -] -output [ - { - name: "INPUT_ID" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "REQUEST_INPUT_LEN" - data_type: TYPE_INT32 - dims: [ 1 ] - }, - { - name: "BAD_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "STOP_WORDS_IDS" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - }, - { - name: "REQUEST_OUTPUT_LEN" - data_type: TYPE_UINT32 - dims: [ -1 ] - } -] - -parameters { - key: "tokenizer_dir" - value: { - string_value: "NousResearch/Llama-2-7b-hf" - } -} - -parameters { - key: "tokenizer_type" - value: { - string_value: "auto" - } -} - -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt deleted file mode 100644 index c12796c1..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -name: "tensorrt_llm" -backend: "tensorrtllm" -max_batch_size: 2048 - -model_transaction_policy { - decoupled: True -} - -input [ - { - name: "input_ids" - data_type: TYPE_INT32 - dims: [ -1 ] - }, - { - name: "input_lengths" - data_type: TYPE_INT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_UINT32 - dims: [ 1 ] - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "pad_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "min_length" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "presence_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "stop" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - }, - { - name: "streaming" - data_type: TYPE_BOOL - dims: [ 1 ] - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_INT32 - dims: [ -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind : KIND_CPU - } -] -parameters: { - key: "max_beam_width" - value: { - string_value: "1" - } -} -parameters: { - key: "FORCE_CPU_ONLY_INPUT_TENSORS" - value: { - string_value: "no" - } -} -parameters: { - key: "gpt_model_type" - value: { - string_value: "inflight_fused_batching" - } -} -parameters: { - key: "gpt_model_path" - value: { - string_value: "/packages/inflight_batcher_llm/tensorrt_llm/1/" - } -} -parameters: { - key: "max_tokens_in_paged_kv_cache" - value: { - string_value: "100000" - } -} -parameters: { - key: "batch_scheduler_policy" - value: { - string_value: "max_utilization" - } -} -parameters: { - key: "kv_cache_free_gpu_mem_fraction" - value: { - string_value: "0.9" - } -} -parameters: { - key: "max_num_sequences" - value: { - string_value: "2048" - } -} -parameters: { - key: "enable_trt_overlap" - value: { - string_value: "False" - } -} diff --git a/llama/llama-2-7b-trt-llm-2tp/packages/utils.py b/llama/llama-2-7b-trt-llm-2tp/packages/utils.py deleted file mode 100644 index 52afe988..00000000 --- a/llama/llama-2-7b-trt-llm-2tp/packages/utils.py +++ /dev/null @@ -1,73 +0,0 @@ -import socket -from pathlib import Path - -import tritonclient.grpc as grpcclient -from huggingface_hub import snapshot_download -from tritonclient.utils import np_to_triton_dtype - -GRPC_SERVICE_PORT = 8001 -HTTP_SERVICE_PORT = 8003 - - -def move_all_files(src: Path, dest: Path): - """ - Moves all files from `src` to `dest` recursively. 
- """ - for item in src.iterdir(): - dest_item = dest / item.name - if item.is_dir(): - dest_item.mkdir(parents=True, exist_ok=True) - move_all_files(item, dest_item) - else: - item.rename(dest_item) - - -def prepare_model_repository(data_dir: Path): - """ - Moves all files from `data_dir` to the model repository directory. - """ - # Ensure the destination directory exists - dest_dir = Path("/packages/inflight_batcher_llm/tensorrt_llm/1") - dest_dir.mkdir(parents=True, exist_ok=True) - - # Ensure empty version directory for `ensemble` model exists - ensemble_dir = Path("/packages/inflight_batcher_llm/ensemble/1") - ensemble_dir.mkdir(parents=True, exist_ok=True) - - # Move all files and directories from data_dir to dest_dir - move_all_files(data_dir, dest_dir) - - -def prepare_grpc_tensor(name, input): - t = grpcclient.InferInput(name, input.shape, np_to_triton_dtype(input.dtype)) - t.set_data_from_numpy(input) - return t - - -def download_engine(engine_repository: str, fp: Path, auth_token=None): - """ - Downloads the specified engine from Hugging Face Hub. - """ - snapshot_download( - engine_repository, - local_dir=fp, - local_dir_use_symlinks=False, - max_workers=4, - **({"use_auth_token": auth_token} if auth_token is not None else {}), - ) - - -def server_loaded(): - def port_is_available(port): - available = False - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - try: - sock.bind(("0.0.0.0", port)) - available = True - except: - pass - return available - - return not port_is_available(GRPC_SERVICE_PORT) or not port_is_available( - HTTP_SERVICE_PORT - )
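
For anyone who wants to exercise the (now reverted) Triton setup above outside of the Truss `predict` path, here is a minimal sketch of the same gRPC call that `model/model.py` and `packages/client.py` make against the `ensemble` model. It assumes a Triton server built from this model repository is already listening on `localhost:8001` (the `GRPC_SERVICE_PORT` from `packages/utils.py`); the tensor names, shapes, and dtypes are copied from the `ensemble` `config.pbtxt`, while the prompt and token budget are placeholder values — treat this as illustrative rather than part of the deployed Truss.

```python
# Minimal sketch: stream tokens from the Triton "ensemble" model over gRPC,
# mirroring what client.py does inside the Truss. Assumes a Triton server
# from this model repository is already running on localhost:8001.
from functools import partial
from queue import Queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException, np_to_triton_dtype


def prepare_tensor(name: str, value: np.ndarray) -> grpcclient.InferInput:
    # Same helper as prepare_grpc_tensor in packages/utils.py.
    tensor = grpcclient.InferInput(name, value.shape, np_to_triton_dtype(value.dtype))
    tensor.set_data_from_numpy(value)
    return tensor


results: Queue = Queue()


def on_result(queue: Queue, result, error) -> None:
    # Triton calls this once per streamed response (or once with an error).
    queue.put(error if error else result)


# Example request: prompt and max_tokens are arbitrary placeholder values.
inputs = [
    prepare_tensor("text_input", np.array([["What is the meaning of life?"]], dtype=object)),
    prepare_tensor("max_tokens", np.array([[64]], dtype=np.uint32)),
    prepare_tensor("bad_words", np.array([[""]], dtype=object)),
    prepare_tensor("stop_words", np.array([[""]], dtype=object)),
    prepare_tensor("stream", np.array([[True]], dtype=bool)),
    prepare_tensor("beam_width", np.array([[1]], dtype=np.uint32)),
]

client = grpcclient.InferenceServerClient(url="localhost:8001", verbose=False)
client.start_stream(callback=partial(on_result, results))
client.async_stream_infer(
    "ensemble", inputs, request_id="1", enable_empty_final_response=True
)

# Drain streamed chunks until Triton marks the final response, as client.py does.
while True:
    chunk = results.get()
    if isinstance(chunk, InferenceServerException):
        print(f"error: {chunk.message()}")
        break
    text = chunk.as_numpy("text_output")
    if text is not None:
        print(text.flatten()[0].decode("utf-8"), end="", flush=True)
    final_flag = chunk.get_response().parameters.get("triton_final_response")
    if final_flag and final_flag.bool_param:
        break

client.stop_stream()
```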