diff --git a/internal/config.yaml b/internal/config.yaml new file mode 100644 index 00000000..59d73f91 --- /dev/null +++ b/internal/config.yaml @@ -0,0 +1,50 @@ + +model_metadata: + tags: + - openai-compatible +model_name: briton-spec-dec +python_version: py310 +requirements: [] +resources: + accelerator: A10G + cpu: '1' + memory: 24Gi + use_gpu: true +runtime: + predict_concurrency: 1000 +secrets: + hf_access_token: None +trt_llm: + draft: + build: + base_model: deepseek + checkpoint_repository: + repo: deepseek-ai/deepseek-coder-1.3b-instruct + source: HF + max_seq_len: 10000 + plugin_configuration: + use_paged_context_fmha: true + tensor_parallel_count: 1 + runtime: + batch_scheduler_policy: max_utilization + enable_chunked_context: true + kv_cache_free_gpu_mem_fraction: 0.6 + num_draft_tokens: 4 + target: + build: + base_model: deepseek + checkpoint_repository: + repo: deepseek-ai/deepseek-coder-1.3b-instruct + source: HF + max_draft_len: 10 + max_seq_len: 10000 + plugin_configuration: + use_paged_context_fmha: true + speculative_decoding_mode: DRAFT_TOKENS_EXTERNAL + tensor_parallel_count: 1 + runtime: + batch_scheduler_policy: max_utilization + enable_chunked_context: true + kv_cache_free_gpu_mem_fraction: 0.65 + request_default_max_tokens: 1000 + total_token_limit: 500000 diff --git a/text-embeddings-inference/.internal/Dockerfile b/text-embeddings-inference/.internal/Dockerfile new file mode 100644 index 00000000..de609b3b --- /dev/null +++ b/text-embeddings-inference/.internal/Dockerfile @@ -0,0 +1,9 @@ +ARG TAG=1.6 +# this image builds a truss-compatible image with the text-embeddings-inference image as base +# it mainly requires python3 +# optional, git and git-lfs are installed to allow for easy cloning of the huggingface model repos. +FROM ghcr.io/huggingface/text-embeddings-inference:${TAG} +RUN apt-get update && apt-get install -y python3 python3-pip git git-lfs +RUN git lfs install +ENTRYPOINT ["text-embeddings-router"] +CMD ["--json-output"] diff --git a/text-embeddings-inference/.internal/roll_out_docker.sh b/text-embeddings-inference/.internal/roll_out_docker.sh new file mode 100755 index 00000000..16300475 --- /dev/null +++ b/text-embeddings-inference/.internal/roll_out_docker.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +# Map architectures to prefixes +declare -A ARCHES=( + ["cpu"]="cpu-" + ["turing"]="turing-" + ["ampere80"]="" + ["ampere86"]="86-" + ["adalovelace"]="89-" + ["hopper"]="hopper-" +) + +# Define version and target +VERSION="1.6" +TARGET="baseten/text-embeddings-inference-mirror" + +# Build and push images +for ARCH in "${!ARCHES[@]}"; do + ARCH_PREFIX=${ARCHES[$ARCH]} + TAG="${TARGET}:${ARCH_PREFIX}${VERSION}" + + echo "Building and pushing image for $ARCH: $TAG" + + docker buildx build -t "$TAG" --build-arg TAG="${ARCH_PREFIX}${VERSION}" --push . +done + +echo "All images have been built and pushed." diff --git a/text-embeddings-inference/README.md b/text-embeddings-inference/README.md index cfdc1423..4a490264 100644 --- a/text-embeddings-inference/README.md +++ b/text-embeddings-inference/README.md @@ -1,32 +1,112 @@ -# Text Embeddings Inference Truss (A100) -This is an example of a Truss model that uses the Text Embeddings Inference API. +# Text Embeddings Inference Truss -## How to Deploy -In the `config.yaml` file, you can specify the model to use, as well as other arguments per the [Text Embeddings Inference API](https://huggingface.co/docs/text-embeddings-inference) documentation. -Note that not all models are supported by TEI. 
+This is a Trussless Custom Server example that deploys [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference), a high-performance server that serves text-embedding, reranking, and classification models as an API.
-To run the model, you can use the following command:
 ```bash
-truss push
+## Deployment
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+3. [Required for gated models] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`.
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples.git
+cd truss-examples/text-embeddings-inference
+```
+
+With `text-embeddings-inference` as your working directory, deploy the model with the following command, pasting your Baseten API key if prompted:
+
+```sh
+truss push --publish
+```
+
+## Performance Optimization
+
+The config.yaml contains a couple of variables that can be tuned, depending on:
+- which GPU is used
+- which model is deployed
+- how many concurrent requests users are sending
+
+The deployment example uses BERT-large on an NVIDIA L4. BERT-large has a maximum sequence length of 512 tokens per sentence.
+For the BERT-large architecture on the L4, there are only marginal gains above a batch size of 16000 tokens.
+
+### Concurrent requests
+```
+--max-concurrent-requests 40
+# and
+runtime:
+  predict_concurrency: 40
+```
+These two settings control the number of parallel `POST` requests.
+Here we allow 40 parallel requests per replica, which lets requests from multiple users be batched together and reach high token counts. Forty parallel requests with one full-length sequence each could already keep the GPU busy: `1*40*512 = 20480` tokens. A client-side sketch of this pattern is shown under "Call your model" below.
+
+### Tokens per batch
+```
+--max-batch-tokens 32768
 ```
-## How to Generate Embeddings
-The truss expects:
-- "texts" parameter with either a single string or an array of strings.
-- "stream" parameter with a boolean value (default is false).
+This sets the total number of tokens in a batch. For embedding models, this largely determines VRAM usage.
+Because most of TEI's models use a `nested` attention implementation, `32768` tokens could mean 64 sentences with 512 tokens each or 512 sentences with 64 tokens each. The first will take slightly longer to compute, but peak VRAM usage stays roughly the same. For `llama`- or `mistral`-based `7b` embedding models, we recommend a lower setting, e.g.
+```
+--max-batch-tokens 8192
+```
+
+### Client batch size
+```
+--max-client-batch-size 32
+```
+Client batch size determines the maximum number of sentences in a single request.
+Increase it if clients cannot send multiple concurrent requests, or if they need to send larger requests.
+
+### Endpoint, Model Selection, and OpenAPI
+Set `predict_endpoint` to `/rerank` or `/predict` if you want to use the rerank or predict endpoint; example payloads for both are shown under "Call your model" below.
+
+Embedding models.
+Example supported models: https://huggingface.co/models?pipeline_tag=feature-extraction&other=text-embeddings-inference&sort=trending
+```yaml
+  predict_endpoint: /v1/embeddings
+```
+Rerank models.
+Example models: https://huggingface.co/models?pipeline_tag=text-classification&other=text-embeddings-inference&sort=trending
+```yaml
+  predict_endpoint: /rerank
+```
+Classification models.
+Example classification model: https://huggingface.co/SamLowe/roberta-base-go_emotions
+```yaml
+  predict_endpoint: /predict
+```
+
+## Call your model
+
+### curl
-To generate embeddings, you can use the following command:
 ```bash
-truss predict --d '{"texts": "This is a test"}'
+curl -X POST https://model-xxx.api.baseten.co/development/predict \
+  -H "Authorization: Api-Key YOUR_API_KEY" \
+  -d '{"input": "text string"}'
 ```
-# Notes
-- The base image is created by installing python on one of the images provided here: https://github.com/huggingface/text-embeddings-inference?tab=readme-ov-file. The current example was built for Ampere 80 architecture, which includes the A100.
-- Multi-GPU appears to have no impact on performance
-- Be aware of the token limit for each embedding model. It is currently up to the caller to ensure that the texts do not exceed the token limit.
-# Improvements
-- It may be possible to create a universal base image using the `-all` dockerfile to support a GPU-agnostic implementation
-- handle truncation / chunking with averaging (or other technique) when tokens > supported
-- investigate impact of dtype on performance
-- Add prompt support to embed with prompt
+### Python `requests` library
+
+```python
+import os
+import requests
+
+resp = requests.post(
+    "https://model-xxx.api.baseten.co/environments/production/predict",
+    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
+    json={"input": ["text string", "second string"]},
+)
+
+print(resp.json())
+```
+
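+### Other endpoints
+
+If you deploy with `predict_endpoint: /rerank` or `/predict` instead of `/v1/embeddings`, the request body changes with the endpoint. The calls below are a sketch based on the request shapes of the TEI API (they mirror the `RerankRequest` and `PredictRequest` messages in the proto file removed by this change); the model URL and API key are placeholders.
+
+```bash
+# Rerank deployment (predict_endpoint: /rerank): score candidate texts against a query
+curl -X POST https://model-xxx.api.baseten.co/development/predict \
+  -H "Authorization: Api-Key YOUR_API_KEY" \
+  -d '{"query": "What is Deep Learning?", "texts": ["Deep Learning is a sub-field of machine learning.", "A recipe for cheesecake."]}'
+
+# Classification deployment (predict_endpoint: /predict): classify a single input
+curl -X POST https://model-xxx.api.baseten.co/development/predict \
+  -H "Authorization: Api-Key YOUR_API_KEY" \
+  -d '{"inputs": "I love this product!"}'
+```
+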
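+### Parallel requests
+
+The concurrency settings described under "Performance Optimization" only pay off if clients actually send requests in parallel, so the server can batch them. The snippet below is a minimal client-side sketch, assuming the `requests` library, a `BASETEN_API_KEY` environment variable, and a placeholder model URL; match the chunk size and worker count to your `--max-client-batch-size` and `--max-concurrent-requests` settings.
+
+```python
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+import requests
+
+API_URL = "https://model-xxx.api.baseten.co/environments/production/predict"  # placeholder
+HEADERS = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
+
+
+def embed(texts):
+    # One POST per chunk; the server batches concurrent requests together.
+    resp = requests.post(API_URL, headers=HEADERS, json={"input": texts})
+    resp.raise_for_status()
+    return resp.json()
+
+
+documents = [f"document number {i}" for i in range(320)]
+# Keep each request within --max-client-batch-size (32 sentences here).
+chunks = [documents[i : i + 32] for i in range(0, len(documents), 32)]
+
+# Up to 40 requests in flight matches --max-concurrent-requests / predict_concurrency.
+with ThreadPoolExecutor(max_workers=40) as pool:
+    results = list(pool.map(embed, chunks))
+
+print(f"Embedded {len(documents)} documents in {len(results)} requests")
+```
+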
+
+## Support
+
+If you have any questions or need assistance, please open an issue in this repository or contact our support team.
diff --git a/text-embeddings-inference/config.yaml b/text-embeddings-inference/config.yaml
index 37d74a70..82332a3c 100644
--- a/text-embeddings-inference/config.yaml
+++ b/text-embeddings-inference/config.yaml
@@ -1,25 +1,30 @@
 base_image:
-  image: vshulman/ampere-truss-custom-text-embeddings-inference:1.0
-  python_executable_path: /usr/bin/python
-build:
-  arguments:
-    model_id: nomic-ai/nomic-embed-text-v1.5
-    model_server: TrussServer
-environment_variables: {}
-external_package_dirs: []
-model_cache:
-- repo_id: nomic-ai/nomic-embed-text-v1.5
-model_metadata: {}
-model_name: TEI Experiment
-python_version: py39
-runtime:
-  predict_concurrency: 512
-requirements: []
+  # select the image that matches your GPU (this config uses the Ada Lovelace / L4 image):
+  # CPU                              baseten/text-embeddings-inference-mirror:cpu-1.6
+  # Turing (T4, ...)                 baseten/text-embeddings-inference-mirror:turing-1.6
+  # Ampere 80 (A100, A30)            baseten/text-embeddings-inference-mirror:1.6
+  # Ampere 86 (A10, A10G, A40, ...)  baseten/text-embeddings-inference-mirror:86-1.6
+  # Ada Lovelace (L4, ...)
baseten/text-embeddings-inference-mirror:89-1.6 + # Hopper (H100/H100 40GB) baseten/text-embeddings-inference-mirror:hopper-1.6 + image: baseten/text-embeddings-inference-mirror:89-1.6 +model_metadata: + repo_id: BAAI/bge-base-en-v1.5 +docker_server: + start_command: sh -c "text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size 32 --max-concurrent-requests 40 --max-batch-tokens 32768" + readiness_endpoint: /health + liveness_endpoint: /health + # change to /rerank or /predict if you want to use the rerank or predict endpoint + # https://huggingface.github.io/text-embeddings-inference/ + predict_endpoint: /v1/embeddings + server_port: 7997 resources: - accelerator: A100 - cpu: '1' - memory: 2Gi + accelerator: L4 use_gpu: true -secrets: {} -system_packages: -- python3.10-venv +model_name: text-embeddings-inference trussless +build_commands: # optional step to download the weights of the model into the image +- git clone https://huggingface.co/BAAI/bge-base-en-v1.5 /data/local-model +runtime: + predict_concurrency : 40 +environment_variables: + VLLM_LOGGING_LEVEL: WARNING + hf_access_token: null diff --git a/text-embeddings-inference/model/__init__.py b/text-embeddings-inference/model/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/text-embeddings-inference/model/model.py b/text-embeddings-inference/model/model.py deleted file mode 100644 index 95dccc9a..00000000 --- a/text-embeddings-inference/model/model.py +++ /dev/null @@ -1,74 +0,0 @@ -import subprocess -import time -from typing import Any, Dict, List - -import grpc -from tei_pb import tei_pb2, tei_pb2_grpc -from tei_pb.text_embeddings_router_config import Config - - -class Model: - MAX_FAILED_SECONDS = 600 # 10 minutes; the reason this would take this long is mostly if we download a large model - - def __init__(self, data_dir, config, secrets): - self._secrets = secrets - self._config = config - - def load(self): - config = Config(self._config["build"]["arguments"]) - config.run_router() - - # Health check loop - channel = grpc.insecure_channel("localhost:80") - stub = tei_pb2_grpc.InfoStub(channel) - healthy = False - failed_seconds = 0 - print( - f"Waiting for model to be ready for up to {self.MAX_FAILED_SECONDS} seconds" - ) - while not healthy and failed_seconds < self.MAX_FAILED_SECONDS: - try: - response = stub.Info(tei_pb2.InfoRequest()) - if ( - response.model_id - ): # Assuming a valid model_id indicates the service is serving - healthy = True - print("Model is ready") - else: - failed_seconds += 1 - time.sleep(1) # wait for a second before retrying - except grpc.RpcError: - failed_seconds += 1 - time.sleep(1) # wait and retry if server is not up yet - - async def predict(self, model_input): - texts = model_input.pop("texts") - stream = model_input.pop("stream", False) - - if isinstance(texts, str): - texts = [texts] - - print(f"Starting to embed {len(texts)} texts") - requests = [tei_pb2.EmbedRequest(inputs=text) for text in texts] - - async def generator(): - with grpc.insecure_channel("localhost:80") as channel: - stub = tei_pb2_grpc.EmbedStub(channel) - responses = stub.EmbedStream(iter(requests)) - for response in responses: - yield list(response.embeddings) - - if stream: - return generator() - else: - embeddings = [] - async for embedding in generator(): - embeddings.append(embedding) - - return {"embeddings": embeddings} - - -if __name__ == "__main__": - model = Model() - model.load() - print(model.predict("What is Deep Learning")) diff --git 
a/text-embeddings-inference/packages/tei_pb/tei.proto b/text-embeddings-inference/packages/tei_pb/tei.proto deleted file mode 100644 index 6538e34a..00000000 --- a/text-embeddings-inference/packages/tei_pb/tei.proto +++ /dev/null @@ -1,187 +0,0 @@ -syntax = "proto3"; - -package tei.v1; - -service Info { - rpc Info (InfoRequest) returns (InfoResponse) { - option idempotency_level = IDEMPOTENT; - }; -} - -service Embed { - rpc Embed (EmbedRequest) returns (EmbedResponse); - rpc EmbedStream (stream EmbedRequest) returns (stream EmbedResponse); - rpc EmbedSparse (EmbedSparseRequest) returns (EmbedSparseResponse); - rpc EmbedSparseStream (stream EmbedSparseRequest) returns (stream EmbedSparseResponse); - rpc EmbedAll (EmbedAllRequest) returns (EmbedAllResponse); - rpc EmbedAllStream (stream EmbedAllRequest) returns (stream EmbedAllResponse); -} - -service Predict { - rpc Predict (PredictRequest) returns (PredictResponse); - rpc PredictPair (PredictPairRequest) returns (PredictResponse); - rpc PredictStream (stream PredictRequest) returns (stream PredictResponse); - rpc PredictPairStream (stream PredictPairRequest) returns (stream PredictResponse); -} - -service Rerank { - rpc Rerank (RerankRequest) returns (RerankResponse); - rpc RerankStream (stream RerankStreamRequest) returns (RerankResponse); -} - -service Tokenize { - rpc Tokenize (EncodeRequest) returns (EncodeResponse); - rpc TokenizeStream (stream EncodeRequest) returns (stream EncodeResponse); - rpc Decode (DecodeRequest) returns (DecodeResponse); - rpc DecodeStream (stream DecodeRequest) returns (stream DecodeResponse); -} - -message InfoRequest {} - -enum ModelType { - MODEL_TYPE_EMBEDDING = 0; - MODEL_TYPE_CLASSIFIER = 1; - MODEL_TYPE_RERANKER = 2; -} - -message InfoResponse { - string version = 1; - optional string sha = 2; - optional string docker_label = 3; - string model_id = 4; - optional string model_sha = 5; - string model_dtype = 6; - ModelType model_type = 7; - uint32 max_concurrent_requests = 8; - uint32 max_input_length = 9; - uint32 max_batch_tokens = 10; - optional uint32 max_batch_requests = 11; - uint32 max_client_batch_size = 12; - uint32 tokenization_workers = 13; -} - -message Metadata { - uint32 compute_chars = 1; - uint32 compute_tokens = 2; - uint64 total_time_ns = 3; - uint64 tokenization_time_ns = 4; - uint64 queue_time_ns = 5; - uint64 inference_time_ns = 6; -} - -message EmbedRequest { - string inputs = 1; - bool truncate = 2; - bool normalize = 3; -} - -message EmbedResponse { - repeated float embeddings = 1; - Metadata metadata = 2; -} - -message EmbedSparseRequest { - string inputs = 1; - bool truncate = 2; -} - -message SparseValue { - uint32 index = 1; - float value = 2; -} - -message EmbedSparseResponse { - repeated SparseValue sparse_embeddings = 1; - Metadata metadata = 2; -} - -message EmbedAllRequest { - string inputs = 1; - bool truncate = 2; -} - -message TokenEmbedding { - repeated float embeddings = 1; -} - -message EmbedAllResponse { - repeated TokenEmbedding token_embeddings = 1; - Metadata metadata = 2; -} - -message PredictRequest { - string inputs = 1; - bool truncate = 2; - bool raw_scores = 3; -} - -message PredictPairRequest { - repeated string inputs = 1; - bool truncate = 2; - bool raw_scores = 3; -} - -message Prediction { - float score = 1; - string label = 2; -} - -message PredictResponse { - repeated Prediction predictions = 1; - Metadata metadata = 2; -} - -message RerankRequest { - string query = 1; - repeated string texts = 2; - bool truncate = 3; - bool raw_scores = 4; - bool 
return_text = 5; -} - -message RerankStreamRequest{ - string query = 1; - string text = 2; - bool truncate = 3; - // The server will only consider the first value - bool raw_scores = 4; - // The server will only consider the first value - bool return_text = 5; -} - -message Rank { - uint32 index = 1; - optional string text = 2; - float score = 3; -} - -message RerankResponse { - repeated Rank ranks = 1; - Metadata metadata = 2; -} - -message EncodeRequest { - string inputs = 1; - bool add_special_tokens = 2; -} - -message SimpleToken { - uint32 id = 1; - string text = 2; - bool special = 3; - optional uint32 start = 4; - optional uint32 stop = 5; -} - -message EncodeResponse { - repeated SimpleToken tokens = 1; -} - -message DecodeRequest { - repeated uint32 ids = 1; - bool skip_special_tokens = 2; -} - -message DecodeResponse { - string text = 1; -} diff --git a/text-embeddings-inference/packages/tei_pb/tei_pb2.py b/text-embeddings-inference/packages/tei_pb/tei_pb2.py deleted file mode 100644 index 5260d967..00000000 --- a/text-embeddings-inference/packages/tei_pb/tei_pb2.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: tei.proto -# Protobuf Python Version: 5.26.1 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\ttei.proto\x12\x06tei.v1"\r\n\x0bInfoRequest"\xa3\x03\n\x0cInfoResponse\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x10\n\x03sha\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x64ocker_label\x18\x03 \x01(\tH\x01\x88\x01\x01\x12\x10\n\x08model_id\x18\x04 \x01(\t\x12\x16\n\tmodel_sha\x18\x05 \x01(\tH\x02\x88\x01\x01\x12\x13\n\x0bmodel_dtype\x18\x06 \x01(\t\x12%\n\nmodel_type\x18\x07 \x01(\x0e\x32\x11.tei.v1.ModelType\x12\x1f\n\x17max_concurrent_requests\x18\x08 \x01(\r\x12\x18\n\x10max_input_length\x18\t \x01(\r\x12\x18\n\x10max_batch_tokens\x18\n \x01(\r\x12\x1f\n\x12max_batch_requests\x18\x0b \x01(\rH\x03\x88\x01\x01\x12\x1d\n\x15max_client_batch_size\x18\x0c \x01(\r\x12\x1c\n\x14tokenization_workers\x18\r \x01(\rB\x06\n\x04_shaB\x0f\n\r_docker_labelB\x0c\n\n_model_shaB\x15\n\x13_max_batch_requests"\xa0\x01\n\x08Metadata\x12\x15\n\rcompute_chars\x18\x01 \x01(\r\x12\x16\n\x0e\x63ompute_tokens\x18\x02 \x01(\r\x12\x15\n\rtotal_time_ns\x18\x03 \x01(\x04\x12\x1c\n\x14tokenization_time_ns\x18\x04 \x01(\x04\x12\x15\n\rqueue_time_ns\x18\x05 \x01(\x04\x12\x19\n\x11inference_time_ns\x18\x06 \x01(\x04"C\n\x0c\x45mbedRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08\x12\x11\n\tnormalize\x18\x03 \x01(\x08"G\n\rEmbedResponse\x12\x12\n\nembeddings\x18\x01 \x03(\x02\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"6\n\x12\x45mbedSparseRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08"+\n\x0bSparseValue\x12\r\n\x05index\x18\x01 \x01(\r\x12\r\n\x05value\x18\x02 \x01(\x02"i\n\x13\x45mbedSparseResponse\x12.\n\x11sparse_embeddings\x18\x01 \x03(\x0b\x32\x13.tei.v1.SparseValue\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"3\n\x0f\x45mbedAllRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 
\x01(\x08"$\n\x0eTokenEmbedding\x12\x12\n\nembeddings\x18\x01 \x03(\x02"h\n\x10\x45mbedAllResponse\x12\x30\n\x10token_embeddings\x18\x01 \x03(\x0b\x32\x16.tei.v1.TokenEmbedding\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"F\n\x0ePredictRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08\x12\x12\n\nraw_scores\x18\x03 \x01(\x08"J\n\x12PredictPairRequest\x12\x0e\n\x06inputs\x18\x01 \x03(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08\x12\x12\n\nraw_scores\x18\x03 \x01(\x08"*\n\nPrediction\x12\r\n\x05score\x18\x01 \x01(\x02\x12\r\n\x05label\x18\x02 \x01(\t"^\n\x0fPredictResponse\x12\'\n\x0bpredictions\x18\x01 \x03(\x0b\x32\x12.tei.v1.Prediction\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"h\n\rRerankRequest\x12\r\n\x05query\x18\x01 \x01(\t\x12\r\n\x05texts\x18\x02 \x03(\t\x12\x10\n\x08truncate\x18\x03 \x01(\x08\x12\x12\n\nraw_scores\x18\x04 \x01(\x08\x12\x13\n\x0breturn_text\x18\x05 \x01(\x08"m\n\x13RerankStreamRequest\x12\r\n\x05query\x18\x01 \x01(\t\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x10\n\x08truncate\x18\x03 \x01(\x08\x12\x12\n\nraw_scores\x18\x04 \x01(\x08\x12\x13\n\x0breturn_text\x18\x05 \x01(\x08"@\n\x04Rank\x12\r\n\x05index\x18\x01 \x01(\r\x12\x11\n\x04text\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\r\n\x05score\x18\x03 \x01(\x02\x42\x07\n\x05_text"Q\n\x0eRerankResponse\x12\x1b\n\x05ranks\x18\x01 \x03(\x0b\x32\x0c.tei.v1.Rank\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata";\n\rEncodeRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x1a\n\x12\x61\x64\x64_special_tokens\x18\x02 \x01(\x08"r\n\x0bSimpleToken\x12\n\n\x02id\x18\x01 \x01(\r\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x0f\n\x07special\x18\x03 \x01(\x08\x12\x12\n\x05start\x18\x04 \x01(\rH\x00\x88\x01\x01\x12\x11\n\x04stop\x18\x05 \x01(\rH\x01\x88\x01\x01\x42\x08\n\x06_startB\x07\n\x05_stop"5\n\x0e\x45ncodeResponse\x12#\n\x06tokens\x18\x01 \x03(\x0b\x32\x13.tei.v1.SimpleToken"9\n\rDecodeRequest\x12\x0b\n\x03ids\x18\x01 \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x02 \x01(\x08"\x1e\n\x0e\x44\x65\x63odeResponse\x12\x0c\n\x04text\x18\x01 
\x01(\t*Y\n\tModelType\x12\x18\n\x14MODEL_TYPE_EMBEDDING\x10\x00\x12\x19\n\x15MODEL_TYPE_CLASSIFIER\x10\x01\x12\x17\n\x13MODEL_TYPE_RERANKER\x10\x02\x32>\n\x04Info\x12\x36\n\x04Info\x12\x13.tei.v1.InfoRequest\x1a\x14.tei.v1.InfoResponse"\x03\x90\x02\x02\x32\x9f\x03\n\x05\x45mbed\x12\x34\n\x05\x45mbed\x12\x14.tei.v1.EmbedRequest\x1a\x15.tei.v1.EmbedResponse\x12>\n\x0b\x45mbedStream\x12\x14.tei.v1.EmbedRequest\x1a\x15.tei.v1.EmbedResponse(\x01\x30\x01\x12\x46\n\x0b\x45mbedSparse\x12\x1a.tei.v1.EmbedSparseRequest\x1a\x1b.tei.v1.EmbedSparseResponse\x12P\n\x11\x45mbedSparseStream\x12\x1a.tei.v1.EmbedSparseRequest\x1a\x1b.tei.v1.EmbedSparseResponse(\x01\x30\x01\x12=\n\x08\x45mbedAll\x12\x17.tei.v1.EmbedAllRequest\x1a\x18.tei.v1.EmbedAllResponse\x12G\n\x0e\x45mbedAllStream\x12\x17.tei.v1.EmbedAllRequest\x1a\x18.tei.v1.EmbedAllResponse(\x01\x30\x01\x32\x9d\x02\n\x07Predict\x12:\n\x07Predict\x12\x16.tei.v1.PredictRequest\x1a\x17.tei.v1.PredictResponse\x12\x42\n\x0bPredictPair\x12\x1a.tei.v1.PredictPairRequest\x1a\x17.tei.v1.PredictResponse\x12\x44\n\rPredictStream\x12\x16.tei.v1.PredictRequest\x1a\x17.tei.v1.PredictResponse(\x01\x30\x01\x12L\n\x11PredictPairStream\x12\x1a.tei.v1.PredictPairRequest\x1a\x17.tei.v1.PredictResponse(\x01\x30\x01\x32\x88\x01\n\x06Rerank\x12\x37\n\x06Rerank\x12\x15.tei.v1.RerankRequest\x1a\x16.tei.v1.RerankResponse\x12\x45\n\x0cRerankStream\x12\x1b.tei.v1.RerankStreamRequest\x1a\x16.tei.v1.RerankResponse(\x01\x32\x86\x02\n\x08Tokenize\x12\x39\n\x08Tokenize\x12\x15.tei.v1.EncodeRequest\x1a\x16.tei.v1.EncodeResponse\x12\x43\n\x0eTokenizeStream\x12\x15.tei.v1.EncodeRequest\x1a\x16.tei.v1.EncodeResponse(\x01\x30\x01\x12\x37\n\x06\x44\x65\x63ode\x12\x15.tei.v1.DecodeRequest\x1a\x16.tei.v1.DecodeResponse\x12\x41\n\x0c\x44\x65\x63odeStream\x12\x15.tei.v1.DecodeRequest\x1a\x16.tei.v1.DecodeResponse(\x01\x30\x01\x62\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "tei_pb2", _globals) -if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals["_INFO"].methods_by_name["Info"]._loaded_options = None - _globals["_INFO"].methods_by_name["Info"]._serialized_options = b"\220\002\002" - _globals["_MODELTYPE"]._serialized_start = 2145 - _globals["_MODELTYPE"]._serialized_end = 2234 - _globals["_INFOREQUEST"]._serialized_start = 21 - _globals["_INFOREQUEST"]._serialized_end = 34 - _globals["_INFORESPONSE"]._serialized_start = 37 - _globals["_INFORESPONSE"]._serialized_end = 456 - _globals["_METADATA"]._serialized_start = 459 - _globals["_METADATA"]._serialized_end = 619 - _globals["_EMBEDREQUEST"]._serialized_start = 621 - _globals["_EMBEDREQUEST"]._serialized_end = 688 - _globals["_EMBEDRESPONSE"]._serialized_start = 690 - _globals["_EMBEDRESPONSE"]._serialized_end = 761 - _globals["_EMBEDSPARSEREQUEST"]._serialized_start = 763 - _globals["_EMBEDSPARSEREQUEST"]._serialized_end = 817 - _globals["_SPARSEVALUE"]._serialized_start = 819 - _globals["_SPARSEVALUE"]._serialized_end = 862 - _globals["_EMBEDSPARSERESPONSE"]._serialized_start = 864 - _globals["_EMBEDSPARSERESPONSE"]._serialized_end = 969 - _globals["_EMBEDALLREQUEST"]._serialized_start = 971 - _globals["_EMBEDALLREQUEST"]._serialized_end = 1022 - _globals["_TOKENEMBEDDING"]._serialized_start = 1024 - _globals["_TOKENEMBEDDING"]._serialized_end = 1060 - _globals["_EMBEDALLRESPONSE"]._serialized_start = 1062 - _globals["_EMBEDALLRESPONSE"]._serialized_end = 1166 - 
_globals["_PREDICTREQUEST"]._serialized_start = 1168 - _globals["_PREDICTREQUEST"]._serialized_end = 1238 - _globals["_PREDICTPAIRREQUEST"]._serialized_start = 1240 - _globals["_PREDICTPAIRREQUEST"]._serialized_end = 1314 - _globals["_PREDICTION"]._serialized_start = 1316 - _globals["_PREDICTION"]._serialized_end = 1358 - _globals["_PREDICTRESPONSE"]._serialized_start = 1360 - _globals["_PREDICTRESPONSE"]._serialized_end = 1454 - _globals["_RERANKREQUEST"]._serialized_start = 1456 - _globals["_RERANKREQUEST"]._serialized_end = 1560 - _globals["_RERANKSTREAMREQUEST"]._serialized_start = 1562 - _globals["_RERANKSTREAMREQUEST"]._serialized_end = 1671 - _globals["_RANK"]._serialized_start = 1673 - _globals["_RANK"]._serialized_end = 1737 - _globals["_RERANKRESPONSE"]._serialized_start = 1739 - _globals["_RERANKRESPONSE"]._serialized_end = 1820 - _globals["_ENCODEREQUEST"]._serialized_start = 1822 - _globals["_ENCODEREQUEST"]._serialized_end = 1881 - _globals["_SIMPLETOKEN"]._serialized_start = 1883 - _globals["_SIMPLETOKEN"]._serialized_end = 1997 - _globals["_ENCODERESPONSE"]._serialized_start = 1999 - _globals["_ENCODERESPONSE"]._serialized_end = 2052 - _globals["_DECODEREQUEST"]._serialized_start = 2054 - _globals["_DECODEREQUEST"]._serialized_end = 2111 - _globals["_DECODERESPONSE"]._serialized_start = 2113 - _globals["_DECODERESPONSE"]._serialized_end = 2143 - _globals["_INFO"]._serialized_start = 2236 - _globals["_INFO"]._serialized_end = 2298 - _globals["_EMBED"]._serialized_start = 2301 - _globals["_EMBED"]._serialized_end = 2716 - _globals["_PREDICT"]._serialized_start = 2719 - _globals["_PREDICT"]._serialized_end = 3004 - _globals["_RERANK"]._serialized_start = 3007 - _globals["_RERANK"]._serialized_end = 3143 - _globals["_TOKENIZE"]._serialized_start = 3146 - _globals["_TOKENIZE"]._serialized_end = 3408 -# @@protoc_insertion_point(module_scope) diff --git a/text-embeddings-inference/packages/tei_pb/tei_pb2_grpc.py b/text-embeddings-inference/packages/tei_pb/tei_pb2_grpc.py deleted file mode 100644 index def8ea5a..00000000 --- a/text-embeddings-inference/packages/tei_pb/tei_pb2_grpc.py +++ /dev/null @@ -1,983 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import warnings - -import grpc - -from . import tei_pb2 as tei__pb2 - -GRPC_GENERATED_VERSION = "1.64.1" -GRPC_VERSION = grpc.__version__ -EXPECTED_ERROR_RELEASE = "1.65.0" -SCHEDULED_RELEASE_DATE = "June 25, 2024" -_version_not_supported = False - -try: - from grpc._utilities import first_version_is_lower - - _version_not_supported = first_version_is_lower( - GRPC_VERSION, GRPC_GENERATED_VERSION - ) -except ImportError: - _version_not_supported = True - -if _version_not_supported: - warnings.warn( - f"The grpc package installed is at version {GRPC_VERSION}," - + f" but the generated code in tei_pb2_grpc.py depends on" - + f" grpcio>={GRPC_GENERATED_VERSION}." - + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" - + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." - + f" This warning will become an error in {EXPECTED_ERROR_RELEASE}," - + f" scheduled for release on {SCHEDULED_RELEASE_DATE}.", - RuntimeWarning, - ) - - -class InfoStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Info = channel.unary_unary( - "/tei.v1.Info/Info", - request_serializer=tei__pb2.InfoRequest.SerializeToString, - response_deserializer=tei__pb2.InfoResponse.FromString, - _registered_method=True, - ) - - -class InfoServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Info(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_InfoServicer_to_server(servicer, server): - rpc_method_handlers = { - "Info": grpc.unary_unary_rpc_method_handler( - servicer.Info, - request_deserializer=tei__pb2.InfoRequest.FromString, - response_serializer=tei__pb2.InfoResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Info", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Info", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Info(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Info( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Info/Info", - tei__pb2.InfoRequest.SerializeToString, - tei__pb2.InfoResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class EmbedStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Embed = channel.unary_unary( - "/tei.v1.Embed/Embed", - request_serializer=tei__pb2.EmbedRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedResponse.FromString, - _registered_method=True, - ) - self.EmbedStream = channel.stream_stream( - "/tei.v1.Embed/EmbedStream", - request_serializer=tei__pb2.EmbedRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedResponse.FromString, - _registered_method=True, - ) - self.EmbedSparse = channel.unary_unary( - "/tei.v1.Embed/EmbedSparse", - request_serializer=tei__pb2.EmbedSparseRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedSparseResponse.FromString, - _registered_method=True, - ) - self.EmbedSparseStream = channel.stream_stream( - "/tei.v1.Embed/EmbedSparseStream", - request_serializer=tei__pb2.EmbedSparseRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedSparseResponse.FromString, - _registered_method=True, - ) - self.EmbedAll = channel.unary_unary( - "/tei.v1.Embed/EmbedAll", - request_serializer=tei__pb2.EmbedAllRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedAllResponse.FromString, - _registered_method=True, - ) - self.EmbedAllStream = channel.stream_stream( - "/tei.v1.Embed/EmbedAllStream", - request_serializer=tei__pb2.EmbedAllRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedAllResponse.FromString, - _registered_method=True, - ) - - -class EmbedServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Embed(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedSparse(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedSparseStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedAll(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedAllStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_EmbedServicer_to_server(servicer, server): - rpc_method_handlers = { - "Embed": grpc.unary_unary_rpc_method_handler( - servicer.Embed, - request_deserializer=tei__pb2.EmbedRequest.FromString, - response_serializer=tei__pb2.EmbedResponse.SerializeToString, - ), - "EmbedStream": grpc.stream_stream_rpc_method_handler( - servicer.EmbedStream, - request_deserializer=tei__pb2.EmbedRequest.FromString, - 
response_serializer=tei__pb2.EmbedResponse.SerializeToString, - ), - "EmbedSparse": grpc.unary_unary_rpc_method_handler( - servicer.EmbedSparse, - request_deserializer=tei__pb2.EmbedSparseRequest.FromString, - response_serializer=tei__pb2.EmbedSparseResponse.SerializeToString, - ), - "EmbedSparseStream": grpc.stream_stream_rpc_method_handler( - servicer.EmbedSparseStream, - request_deserializer=tei__pb2.EmbedSparseRequest.FromString, - response_serializer=tei__pb2.EmbedSparseResponse.SerializeToString, - ), - "EmbedAll": grpc.unary_unary_rpc_method_handler( - servicer.EmbedAll, - request_deserializer=tei__pb2.EmbedAllRequest.FromString, - response_serializer=tei__pb2.EmbedAllResponse.SerializeToString, - ), - "EmbedAllStream": grpc.stream_stream_rpc_method_handler( - servicer.EmbedAllStream, - request_deserializer=tei__pb2.EmbedAllRequest.FromString, - response_serializer=tei__pb2.EmbedAllResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Embed", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Embed", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Embed(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Embed( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Embed/Embed", - tei__pb2.EmbedRequest.SerializeToString, - tei__pb2.EmbedResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Embed/EmbedStream", - tei__pb2.EmbedRequest.SerializeToString, - tei__pb2.EmbedResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedSparse( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Embed/EmbedSparse", - tei__pb2.EmbedSparseRequest.SerializeToString, - tei__pb2.EmbedSparseResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedSparseStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Embed/EmbedSparseStream", - tei__pb2.EmbedSparseRequest.SerializeToString, - tei__pb2.EmbedSparseResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - 
_registered_method=True, - ) - - @staticmethod - def EmbedAll( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Embed/EmbedAll", - tei__pb2.EmbedAllRequest.SerializeToString, - tei__pb2.EmbedAllResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedAllStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Embed/EmbedAllStream", - tei__pb2.EmbedAllRequest.SerializeToString, - tei__pb2.EmbedAllResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class PredictStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.Predict = channel.unary_unary( - "/tei.v1.Predict/Predict", - request_serializer=tei__pb2.PredictRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - self.PredictPair = channel.unary_unary( - "/tei.v1.Predict/PredictPair", - request_serializer=tei__pb2.PredictPairRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - self.PredictStream = channel.stream_stream( - "/tei.v1.Predict/PredictStream", - request_serializer=tei__pb2.PredictRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - self.PredictPairStream = channel.stream_stream( - "/tei.v1.Predict/PredictPairStream", - request_serializer=tei__pb2.PredictPairRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - - -class PredictServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Predict(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PredictPair(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PredictStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PredictPairStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_PredictServicer_to_server(servicer, server): - rpc_method_handlers = 
{ - "Predict": grpc.unary_unary_rpc_method_handler( - servicer.Predict, - request_deserializer=tei__pb2.PredictRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - "PredictPair": grpc.unary_unary_rpc_method_handler( - servicer.PredictPair, - request_deserializer=tei__pb2.PredictPairRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - "PredictStream": grpc.stream_stream_rpc_method_handler( - servicer.PredictStream, - request_deserializer=tei__pb2.PredictRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - "PredictPairStream": grpc.stream_stream_rpc_method_handler( - servicer.PredictPairStream, - request_deserializer=tei__pb2.PredictPairRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Predict", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Predict", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Predict(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Predict( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Predict/Predict", - tei__pb2.PredictRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def PredictPair( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Predict/PredictPair", - tei__pb2.PredictPairRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def PredictStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Predict/PredictStream", - tei__pb2.PredictRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def PredictPairStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Predict/PredictPairStream", - tei__pb2.PredictPairRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class RerankStub(object): - """Missing 
associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.Rerank = channel.unary_unary( - "/tei.v1.Rerank/Rerank", - request_serializer=tei__pb2.RerankRequest.SerializeToString, - response_deserializer=tei__pb2.RerankResponse.FromString, - _registered_method=True, - ) - self.RerankStream = channel.stream_unary( - "/tei.v1.Rerank/RerankStream", - request_serializer=tei__pb2.RerankStreamRequest.SerializeToString, - response_deserializer=tei__pb2.RerankResponse.FromString, - _registered_method=True, - ) - - -class RerankServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Rerank(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def RerankStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_RerankServicer_to_server(servicer, server): - rpc_method_handlers = { - "Rerank": grpc.unary_unary_rpc_method_handler( - servicer.Rerank, - request_deserializer=tei__pb2.RerankRequest.FromString, - response_serializer=tei__pb2.RerankResponse.SerializeToString, - ), - "RerankStream": grpc.stream_unary_rpc_method_handler( - servicer.RerankStream, - request_deserializer=tei__pb2.RerankStreamRequest.FromString, - response_serializer=tei__pb2.RerankResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Rerank", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Rerank", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Rerank(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Rerank( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Rerank/Rerank", - tei__pb2.RerankRequest.SerializeToString, - tei__pb2.RerankResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def RerankStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_unary( - request_iterator, - target, - "/tei.v1.Rerank/RerankStream", - tei__pb2.RerankStreamRequest.SerializeToString, - tei__pb2.RerankResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class TokenizeStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Tokenize = channel.unary_unary( - "/tei.v1.Tokenize/Tokenize", - request_serializer=tei__pb2.EncodeRequest.SerializeToString, - response_deserializer=tei__pb2.EncodeResponse.FromString, - _registered_method=True, - ) - self.TokenizeStream = channel.stream_stream( - "/tei.v1.Tokenize/TokenizeStream", - request_serializer=tei__pb2.EncodeRequest.SerializeToString, - response_deserializer=tei__pb2.EncodeResponse.FromString, - _registered_method=True, - ) - self.Decode = channel.unary_unary( - "/tei.v1.Tokenize/Decode", - request_serializer=tei__pb2.DecodeRequest.SerializeToString, - response_deserializer=tei__pb2.DecodeResponse.FromString, - _registered_method=True, - ) - self.DecodeStream = channel.stream_stream( - "/tei.v1.Tokenize/DecodeStream", - request_serializer=tei__pb2.DecodeRequest.SerializeToString, - response_deserializer=tei__pb2.DecodeResponse.FromString, - _registered_method=True, - ) - - -class TokenizeServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Tokenize(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def TokenizeStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def Decode(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def DecodeStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_TokenizeServicer_to_server(servicer, server): - rpc_method_handlers = { - "Tokenize": grpc.unary_unary_rpc_method_handler( - servicer.Tokenize, - request_deserializer=tei__pb2.EncodeRequest.FromString, - response_serializer=tei__pb2.EncodeResponse.SerializeToString, - ), - "TokenizeStream": grpc.stream_stream_rpc_method_handler( - servicer.TokenizeStream, - request_deserializer=tei__pb2.EncodeRequest.FromString, - response_serializer=tei__pb2.EncodeResponse.SerializeToString, - ), - "Decode": grpc.unary_unary_rpc_method_handler( - servicer.Decode, - request_deserializer=tei__pb2.DecodeRequest.FromString, - response_serializer=tei__pb2.DecodeResponse.SerializeToString, - ), - "DecodeStream": grpc.stream_stream_rpc_method_handler( - servicer.DecodeStream, - request_deserializer=tei__pb2.DecodeRequest.FromString, - response_serializer=tei__pb2.DecodeResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Tokenize", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Tokenize", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. 
-class Tokenize(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Tokenize( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Tokenize/Tokenize", - tei__pb2.EncodeRequest.SerializeToString, - tei__pb2.EncodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def TokenizeStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Tokenize/TokenizeStream", - tei__pb2.EncodeRequest.SerializeToString, - tei__pb2.EncodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def Decode( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Tokenize/Decode", - tei__pb2.DecodeRequest.SerializeToString, - tei__pb2.DecodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def DecodeStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Tokenize/DecodeStream", - tei__pb2.DecodeRequest.SerializeToString, - tei__pb2.DecodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) diff --git a/text-embeddings-inference/packages/tei_pb/text_embeddings_router_config.py b/text-embeddings-inference/packages/tei_pb/text_embeddings_router_config.py deleted file mode 100644 index b5764655..00000000 --- a/text-embeddings-inference/packages/tei_pb/text_embeddings_router_config.py +++ /dev/null @@ -1,53 +0,0 @@ -import subprocess -from typing import Any, Dict, List, Optional - - -class Config: - def __init__(self, config: Dict[str, Any]): - self.config = config - self._validate_config() - - def _validate_config(self): - required_keys = ["model_id"] - for key in required_keys: - if key not in self.config: - raise ValueError(f"Missing required configuration key: {key}") - - def get_command(self) -> List[str]: - command = [ - "/usr/local/bin/text-embeddings-router", - "--port", - "80", - "--model-id", - self.config["model_id"], - ] - - optional_params = { - "revision": "--revision", - "tokenization_workers": "--tokenization-workers", - "dtype": "--dtype", - "pooling": "--pooling", - "max_concurrent_requests": "--max-concurrent-requests", - "max_batch_tokens": "--max-batch-tokens", - "max_batch_requests": "--max-batch-requests", - "max_client_batch_size": "--max-client-batch-size", - 
"hf_api_token": "--hf-api-token", - "uds_path": "--uds-path", - "huggingface_hub_cache": "--huggingface-hub-cache", - "payload_limit": "--payload-limit", - "api_key": "--api-key", - "json_output": "--json-output", - "otlp_endpoint": "--otlp-endpoint", - } - - for key, param in optional_params.items(): - value = self.config.get(key) - if value is not None: - command.extend([param, str(value)]) - - return command - - def run_router(self): - command = self.get_command() - with open("/var/log/text_embeddings_router.log", "w") as log_file: - subprocess.Popen(command, stdout=log_file, stderr=log_file)