diff --git a/internal/config.yaml b/internal/config.yaml new file mode 100644 index 00000000..59d73f91 --- /dev/null +++ b/internal/config.yaml @@ -0,0 +1,50 @@ + +model_metadata: + tags: + - openai-compatible +model_name: briton-spec-dec +python_version: py310 +requirements: [] +resources: + accelerator: A10G + cpu: '1' + memory: 24Gi + use_gpu: true +runtime: + predict_concurrency: 1000 +secrets: + hf_access_token: None +trt_llm: + draft: + build: + base_model: deepseek + checkpoint_repository: + repo: deepseek-ai/deepseek-coder-1.3b-instruct + source: HF + max_seq_len: 10000 + plugin_configuration: + use_paged_context_fmha: true + tensor_parallel_count: 1 + runtime: + batch_scheduler_policy: max_utilization + enable_chunked_context: true + kv_cache_free_gpu_mem_fraction: 0.6 + num_draft_tokens: 4 + target: + build: + base_model: deepseek + checkpoint_repository: + repo: deepseek-ai/deepseek-coder-1.3b-instruct + source: HF + max_draft_len: 10 + max_seq_len: 10000 + plugin_configuration: + use_paged_context_fmha: true + speculative_decoding_mode: DRAFT_TOKENS_EXTERNAL + tensor_parallel_count: 1 + runtime: + batch_scheduler_policy: max_utilization + enable_chunked_context: true + kv_cache_free_gpu_mem_fraction: 0.65 + request_default_max_tokens: 1000 + total_token_limit: 500000 diff --git a/text-embeddings-inference/.internal/Dockerfile b/text-embeddings-inference/.internal/Dockerfile new file mode 100644 index 00000000..de609b3b --- /dev/null +++ b/text-embeddings-inference/.internal/Dockerfile @@ -0,0 +1,9 @@ +ARG TAG=1.6 +# this image builds a truss-compatible image with the text-embeddings-inference image as base +# it mainly requires python3 +# optional, git and git-lfs are installed to allow for easy cloning of the huggingface model repos. +FROM ghcr.io/huggingface/text-embeddings-inference:${TAG} +RUN apt-get update && apt-get install -y python3 python3-pip git git-lfs +RUN git lfs install +ENTRYPOINT ["text-embeddings-router"] +CMD ["--json-output"] diff --git a/text-embeddings-inference/.internal/roll_out_docker.sh b/text-embeddings-inference/.internal/roll_out_docker.sh new file mode 100755 index 00000000..16300475 --- /dev/null +++ b/text-embeddings-inference/.internal/roll_out_docker.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +# Map architectures to prefixes +declare -A ARCHES=( + ["cpu"]="cpu-" + ["turing"]="turing-" + ["ampere80"]="" + ["ampere86"]="86-" + ["adalovelace"]="89-" + ["hopper"]="hopper-" +) + +# Define version and target +VERSION="1.6" +TARGET="baseten/text-embeddings-inference-mirror" + +# Build and push images +for ARCH in "${!ARCHES[@]}"; do + ARCH_PREFIX=${ARCHES[$ARCH]} + TAG="${TARGET}:${ARCH_PREFIX}${VERSION}" + + echo "Building and pushing image for $ARCH: $TAG" + + docker buildx build -t "$TAG" --build-arg TAG="${ARCH_PREFIX}${VERSION}" --push . +done + +echo "All images have been built and pushed." diff --git a/text-embeddings-inference/README.md b/text-embeddings-inference/README.md index cfdc1423..4a490264 100644 --- a/text-embeddings-inference/README.md +++ b/text-embeddings-inference/README.md @@ -1,32 +1,112 @@ -# Text Embeddings Inference Truss (A100) -This is an example of a Truss model that uses the Text Embeddings Inference API. +# Text Embeddings Inference Truss -## How to Deploy -In the `config.yaml` file, you can specify the model to use, as well as other arguments per the [Text Embeddings Inference API](https://huggingface.co/docs/text-embeddings-inference) documentation. -Note that not all models are supported by TEI. 
+This is a Trussless Custom Server example that deploys [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference), a high-performance server that serves text-embedding, reranking, and classification models as an API.
-To run the model, you can use the following command:
 ```bash
-truss push
+## Deployment
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+3. [Required for gated models] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`.
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples.git
+cd truss-examples/text-embeddings-inference
+```
+
+With `text-embeddings-inference` as your working directory, deploy the model with the following command, pasting your Baseten API key if prompted:
+
+```sh
+truss push --publish
+```
+
+## Performance Optimization
+
+The config.yaml contains a couple of variables that can be tuned, depending on:
+- which GPU is used
+- which model is deployed
+- how many concurrent requests users are sending
+
+The deployment example uses BERT-large on an NVIDIA L4. BERT-large has a maximum sequence length of 512 tokens per sentence.
+For the BERT-large architecture on the L4, there are only marginal gains above a batch size of 16000 tokens.
+
+### Concurrent requests
+```
+--max-concurrent-requests 40
+# and
+runtime:
+  predict_concurrency: 40
+```
+These two settings control the number of parallel `POST` requests.
+Here we allow 40 parallel requests per replica, which lets requests from multiple users be batched together and reach high token counts. Forty parallel requests with one full-length sequence each could already keep the GPU busy: `1*40*512 = 20480` tokens. A client-side sketch of this pattern is shown under "Call your model" below.
+
+### Tokens per batch
+```
+--max-batch-tokens 32768
 ```
-## How to Generate Embeddings
-The truss expects:
-- "texts" parameter with either a single string or an array of strings.
-- "stream" parameter with a boolean value (default is false).
+This sets the total number of tokens in a batch. For embedding models, this largely determines VRAM usage.
+Because most of TEI's models use a `nested` attention implementation, `32768` tokens could mean 64 sentences with 512 tokens each or 512 sentences with 64 tokens each. The first will take slightly longer to compute, but peak VRAM usage stays roughly the same. For `llama`- or `mistral`-based `7b` embedding models, we recommend a lower setting, e.g.
+```
+--max-batch-tokens 8192
+```
+
+### Client batch size
+```
+--max-client-batch-size 32
+```
+Client batch size determines the maximum number of sentences in a single request.
+Increase it if clients cannot send multiple concurrent requests, or if they need to send larger requests.
+
+### Endpoint, Model Selection, and OpenAPI
+Set `predict_endpoint` to `/rerank` or `/predict` if you want to use the rerank or predict endpoint; example payloads for both are shown under "Call your model" below.
+
+Embedding models.
+Example supported models: https://huggingface.co/models?pipeline_tag=feature-extraction&other=text-embeddings-inference&sort=trending
+```yaml
+  predict_endpoint: /v1/embeddings
+```
+Rerank models.
+Example models: https://huggingface.co/models?pipeline_tag=text-classification&other=text-embeddings-inference&sort=trending
+```yaml
+  predict_endpoint: /rerank
+```
+Classification models.
+Example classification model: https://huggingface.co/SamLowe/roberta-base-go_emotions
+```yaml
+  predict_endpoint: /predict
+```
+
+## Call your model
+
+### curl
-To generate embeddings, you can use the following command:
 ```bash
-truss predict --d '{"texts": "This is a test"}'
+curl -X POST https://model-xxx.api.baseten.co/development/predict \
+  -H "Authorization: Api-Key YOUR_API_KEY" \
+  -d '{"input": "text string"}'
 ```
-# Notes
-- The base image is created by installing python on one of the images provided here: https://github.com/huggingface/text-embeddings-inference?tab=readme-ov-file. The current example was built for Ampere 80 architecture, which includes the A100.
-- Multi-GPU appears to have no impact on performance
-- Be aware of the token limit for each embedding model. It is currently up to the caller to ensure that the texts do not exceed the token limit.
-# Improvements
-- It may be possible to create a universal base image using the `-all` dockerfile to support a GPU-agnostic implementation
-- handle truncation / chunking with averaging (or other technique) when tokens > supported
-- investigate impact of dtype on performance
-- Add prompt support to embed with prompt
+### Python `requests` library
+
+```python
+import os
+import requests
+
+resp = requests.post(
+    "https://model-xxx.api.baseten.co/environments/production/predict",
+    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
+    json={"input": ["text string", "second string"]},
+)
+
+print(resp.json())
+```
+
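+### Other endpoints
+
+If you deploy with `predict_endpoint: /rerank` or `/predict` instead of `/v1/embeddings`, the request body changes with the endpoint. The calls below are a sketch based on the request shapes of the TEI API (they mirror the `RerankRequest` and `PredictRequest` messages in the proto file removed by this change); the model URL and API key are placeholders.
+
+```bash
+# Rerank deployment (predict_endpoint: /rerank): score candidate texts against a query
+curl -X POST https://model-xxx.api.baseten.co/development/predict \
+  -H "Authorization: Api-Key YOUR_API_KEY" \
+  -d '{"query": "What is Deep Learning?", "texts": ["Deep Learning is a sub-field of machine learning.", "A recipe for cheesecake."]}'
+
+# Classification deployment (predict_endpoint: /predict): classify a single input
+curl -X POST https://model-xxx.api.baseten.co/development/predict \
+  -H "Authorization: Api-Key YOUR_API_KEY" \
+  -d '{"inputs": "I love this product!"}'
+```
+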
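+### Parallel requests
+
+The concurrency settings described under "Performance Optimization" only pay off if clients actually send requests in parallel, so the server can batch them. The snippet below is a minimal client-side sketch, assuming the `requests` library, a `BASETEN_API_KEY` environment variable, and a placeholder model URL; match the chunk size and worker count to your `--max-client-batch-size` and `--max-concurrent-requests` settings.
+
+```python
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+import requests
+
+API_URL = "https://model-xxx.api.baseten.co/environments/production/predict"  # placeholder
+HEADERS = {"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"}
+
+
+def embed(texts):
+    # One POST per chunk; the server batches concurrent requests together.
+    resp = requests.post(API_URL, headers=HEADERS, json={"input": texts})
+    resp.raise_for_status()
+    return resp.json()
+
+
+documents = [f"document number {i}" for i in range(320)]
+# Keep each request within --max-client-batch-size (32 sentences here).
+chunks = [documents[i : i + 32] for i in range(0, len(documents), 32)]
+
+# Up to 40 requests in flight matches --max-concurrent-requests / predict_concurrency.
+with ThreadPoolExecutor(max_workers=40) as pool:
+    results = list(pool.map(embed, chunks))
+
+print(f"Embedded {len(documents)} documents in {len(results)} requests")
+```
+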
+
+## Support
+
+If you have any questions or need assistance, please open an issue in this repository or contact our support team.
diff --git a/text-embeddings-inference/config.yaml b/text-embeddings-inference/config.yaml
index 37d74a70..82332a3c 100644
--- a/text-embeddings-inference/config.yaml
+++ b/text-embeddings-inference/config.yaml
@@ -1,25 +1,30 @@
 base_image:
-  image: vshulman/ampere-truss-custom-text-embeddings-inference:1.0
-  python_executable_path: /usr/bin/python
-build:
-  arguments:
-    model_id: nomic-ai/nomic-embed-text-v1.5
-    model_server: TrussServer
-environment_variables: {}
-external_package_dirs: []
-model_cache:
-- repo_id: nomic-ai/nomic-embed-text-v1.5
-model_metadata: {}
-model_name: TEI Experiment
-python_version: py39
-runtime:
-  predict_concurrency: 512
-requirements: []
+  # select the image that matches your GPU (this config uses the Ada Lovelace / L4 image):
+  # CPU                              baseten/text-embeddings-inference-mirror:cpu-1.6
+  # Turing (T4, ...)                 baseten/text-embeddings-inference-mirror:turing-1.6
+  # Ampere 80 (A100, A30)            baseten/text-embeddings-inference-mirror:1.6
+  # Ampere 86 (A10, A10G, A40, ...)  baseten/text-embeddings-inference-mirror:86-1.6
+  # Ada Lovelace (L4, ...)
baseten/text-embeddings-inference-mirror:89-1.6 + # Hopper (H100/H100 40GB) baseten/text-embeddings-inference-mirror:hopper-1.6 + image: baseten/text-embeddings-inference-mirror:89-1.6 +model_metadata: + repo_id: BAAI/bge-base-en-v1.5 +docker_server: + start_command: sh -c "text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size 32 --max-concurrent-requests 40 --max-batch-tokens 32768" + readiness_endpoint: /health + liveness_endpoint: /health + # change to /rerank or /predict if you want to use the rerank or predict endpoint + # https://huggingface.github.io/text-embeddings-inference/ + predict_endpoint: /v1/embeddings + server_port: 7997 resources: - accelerator: A100 - cpu: '1' - memory: 2Gi + accelerator: L4 use_gpu: true -secrets: {} -system_packages: -- python3.10-venv +model_name: text-embeddings-inference trussless +build_commands: # optional step to download the weights of the model into the image +- git clone https://huggingface.co/BAAI/bge-base-en-v1.5 /data/local-model +runtime: + predict_concurrency : 40 +environment_variables: + VLLM_LOGGING_LEVEL: WARNING + hf_access_token: null diff --git a/text-embeddings-inference/model/__init__.py b/text-embeddings-inference/model/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/text-embeddings-inference/model/model.py b/text-embeddings-inference/model/model.py deleted file mode 100644 index 95dccc9a..00000000 --- a/text-embeddings-inference/model/model.py +++ /dev/null @@ -1,74 +0,0 @@ -import subprocess -import time -from typing import Any, Dict, List - -import grpc -from tei_pb import tei_pb2, tei_pb2_grpc -from tei_pb.text_embeddings_router_config import Config - - -class Model: - MAX_FAILED_SECONDS = 600 # 10 minutes; the reason this would take this long is mostly if we download a large model - - def __init__(self, data_dir, config, secrets): - self._secrets = secrets - self._config = config - - def load(self): - config = Config(self._config["build"]["arguments"]) - config.run_router() - - # Health check loop - channel = grpc.insecure_channel("localhost:80") - stub = tei_pb2_grpc.InfoStub(channel) - healthy = False - failed_seconds = 0 - print( - f"Waiting for model to be ready for up to {self.MAX_FAILED_SECONDS} seconds" - ) - while not healthy and failed_seconds < self.MAX_FAILED_SECONDS: - try: - response = stub.Info(tei_pb2.InfoRequest()) - if ( - response.model_id - ): # Assuming a valid model_id indicates the service is serving - healthy = True - print("Model is ready") - else: - failed_seconds += 1 - time.sleep(1) # wait for a second before retrying - except grpc.RpcError: - failed_seconds += 1 - time.sleep(1) # wait and retry if server is not up yet - - async def predict(self, model_input): - texts = model_input.pop("texts") - stream = model_input.pop("stream", False) - - if isinstance(texts, str): - texts = [texts] - - print(f"Starting to embed {len(texts)} texts") - requests = [tei_pb2.EmbedRequest(inputs=text) for text in texts] - - async def generator(): - with grpc.insecure_channel("localhost:80") as channel: - stub = tei_pb2_grpc.EmbedStub(channel) - responses = stub.EmbedStream(iter(requests)) - for response in responses: - yield list(response.embeddings) - - if stream: - return generator() - else: - embeddings = [] - async for embedding in generator(): - embeddings.append(embedding) - - return {"embeddings": embeddings} - - -if __name__ == "__main__": - model = Model() - model.load() - print(model.predict("What is Deep Learning")) diff --git 
a/text-embeddings-inference/packages/tei_pb/tei.proto b/text-embeddings-inference/packages/tei_pb/tei.proto deleted file mode 100644 index 6538e34a..00000000 --- a/text-embeddings-inference/packages/tei_pb/tei.proto +++ /dev/null @@ -1,187 +0,0 @@ -syntax = "proto3"; - -package tei.v1; - -service Info { - rpc Info (InfoRequest) returns (InfoResponse) { - option idempotency_level = IDEMPOTENT; - }; -} - -service Embed { - rpc Embed (EmbedRequest) returns (EmbedResponse); - rpc EmbedStream (stream EmbedRequest) returns (stream EmbedResponse); - rpc EmbedSparse (EmbedSparseRequest) returns (EmbedSparseResponse); - rpc EmbedSparseStream (stream EmbedSparseRequest) returns (stream EmbedSparseResponse); - rpc EmbedAll (EmbedAllRequest) returns (EmbedAllResponse); - rpc EmbedAllStream (stream EmbedAllRequest) returns (stream EmbedAllResponse); -} - -service Predict { - rpc Predict (PredictRequest) returns (PredictResponse); - rpc PredictPair (PredictPairRequest) returns (PredictResponse); - rpc PredictStream (stream PredictRequest) returns (stream PredictResponse); - rpc PredictPairStream (stream PredictPairRequest) returns (stream PredictResponse); -} - -service Rerank { - rpc Rerank (RerankRequest) returns (RerankResponse); - rpc RerankStream (stream RerankStreamRequest) returns (RerankResponse); -} - -service Tokenize { - rpc Tokenize (EncodeRequest) returns (EncodeResponse); - rpc TokenizeStream (stream EncodeRequest) returns (stream EncodeResponse); - rpc Decode (DecodeRequest) returns (DecodeResponse); - rpc DecodeStream (stream DecodeRequest) returns (stream DecodeResponse); -} - -message InfoRequest {} - -enum ModelType { - MODEL_TYPE_EMBEDDING = 0; - MODEL_TYPE_CLASSIFIER = 1; - MODEL_TYPE_RERANKER = 2; -} - -message InfoResponse { - string version = 1; - optional string sha = 2; - optional string docker_label = 3; - string model_id = 4; - optional string model_sha = 5; - string model_dtype = 6; - ModelType model_type = 7; - uint32 max_concurrent_requests = 8; - uint32 max_input_length = 9; - uint32 max_batch_tokens = 10; - optional uint32 max_batch_requests = 11; - uint32 max_client_batch_size = 12; - uint32 tokenization_workers = 13; -} - -message Metadata { - uint32 compute_chars = 1; - uint32 compute_tokens = 2; - uint64 total_time_ns = 3; - uint64 tokenization_time_ns = 4; - uint64 queue_time_ns = 5; - uint64 inference_time_ns = 6; -} - -message EmbedRequest { - string inputs = 1; - bool truncate = 2; - bool normalize = 3; -} - -message EmbedResponse { - repeated float embeddings = 1; - Metadata metadata = 2; -} - -message EmbedSparseRequest { - string inputs = 1; - bool truncate = 2; -} - -message SparseValue { - uint32 index = 1; - float value = 2; -} - -message EmbedSparseResponse { - repeated SparseValue sparse_embeddings = 1; - Metadata metadata = 2; -} - -message EmbedAllRequest { - string inputs = 1; - bool truncate = 2; -} - -message TokenEmbedding { - repeated float embeddings = 1; -} - -message EmbedAllResponse { - repeated TokenEmbedding token_embeddings = 1; - Metadata metadata = 2; -} - -message PredictRequest { - string inputs = 1; - bool truncate = 2; - bool raw_scores = 3; -} - -message PredictPairRequest { - repeated string inputs = 1; - bool truncate = 2; - bool raw_scores = 3; -} - -message Prediction { - float score = 1; - string label = 2; -} - -message PredictResponse { - repeated Prediction predictions = 1; - Metadata metadata = 2; -} - -message RerankRequest { - string query = 1; - repeated string texts = 2; - bool truncate = 3; - bool raw_scores = 4; - bool 
return_text = 5; -} - -message RerankStreamRequest{ - string query = 1; - string text = 2; - bool truncate = 3; - // The server will only consider the first value - bool raw_scores = 4; - // The server will only consider the first value - bool return_text = 5; -} - -message Rank { - uint32 index = 1; - optional string text = 2; - float score = 3; -} - -message RerankResponse { - repeated Rank ranks = 1; - Metadata metadata = 2; -} - -message EncodeRequest { - string inputs = 1; - bool add_special_tokens = 2; -} - -message SimpleToken { - uint32 id = 1; - string text = 2; - bool special = 3; - optional uint32 start = 4; - optional uint32 stop = 5; -} - -message EncodeResponse { - repeated SimpleToken tokens = 1; -} - -message DecodeRequest { - repeated uint32 ids = 1; - bool skip_special_tokens = 2; -} - -message DecodeResponse { - string text = 1; -} diff --git a/text-embeddings-inference/packages/tei_pb/tei_pb2.py b/text-embeddings-inference/packages/tei_pb/tei_pb2.py deleted file mode 100644 index 5260d967..00000000 --- a/text-embeddings-inference/packages/tei_pb/tei_pb2.py +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: tei.proto -# Protobuf Python Version: 5.26.1 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\ttei.proto\x12\x06tei.v1"\r\n\x0bInfoRequest"\xa3\x03\n\x0cInfoResponse\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x10\n\x03sha\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\x19\n\x0c\x64ocker_label\x18\x03 \x01(\tH\x01\x88\x01\x01\x12\x10\n\x08model_id\x18\x04 \x01(\t\x12\x16\n\tmodel_sha\x18\x05 \x01(\tH\x02\x88\x01\x01\x12\x13\n\x0bmodel_dtype\x18\x06 \x01(\t\x12%\n\nmodel_type\x18\x07 \x01(\x0e\x32\x11.tei.v1.ModelType\x12\x1f\n\x17max_concurrent_requests\x18\x08 \x01(\r\x12\x18\n\x10max_input_length\x18\t \x01(\r\x12\x18\n\x10max_batch_tokens\x18\n \x01(\r\x12\x1f\n\x12max_batch_requests\x18\x0b \x01(\rH\x03\x88\x01\x01\x12\x1d\n\x15max_client_batch_size\x18\x0c \x01(\r\x12\x1c\n\x14tokenization_workers\x18\r \x01(\rB\x06\n\x04_shaB\x0f\n\r_docker_labelB\x0c\n\n_model_shaB\x15\n\x13_max_batch_requests"\xa0\x01\n\x08Metadata\x12\x15\n\rcompute_chars\x18\x01 \x01(\r\x12\x16\n\x0e\x63ompute_tokens\x18\x02 \x01(\r\x12\x15\n\rtotal_time_ns\x18\x03 \x01(\x04\x12\x1c\n\x14tokenization_time_ns\x18\x04 \x01(\x04\x12\x15\n\rqueue_time_ns\x18\x05 \x01(\x04\x12\x19\n\x11inference_time_ns\x18\x06 \x01(\x04"C\n\x0c\x45mbedRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08\x12\x11\n\tnormalize\x18\x03 \x01(\x08"G\n\rEmbedResponse\x12\x12\n\nembeddings\x18\x01 \x03(\x02\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"6\n\x12\x45mbedSparseRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08"+\n\x0bSparseValue\x12\r\n\x05index\x18\x01 \x01(\r\x12\r\n\x05value\x18\x02 \x01(\x02"i\n\x13\x45mbedSparseResponse\x12.\n\x11sparse_embeddings\x18\x01 \x03(\x0b\x32\x13.tei.v1.SparseValue\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"3\n\x0f\x45mbedAllRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 
\x01(\x08"$\n\x0eTokenEmbedding\x12\x12\n\nembeddings\x18\x01 \x03(\x02"h\n\x10\x45mbedAllResponse\x12\x30\n\x10token_embeddings\x18\x01 \x03(\x0b\x32\x16.tei.v1.TokenEmbedding\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"F\n\x0ePredictRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08\x12\x12\n\nraw_scores\x18\x03 \x01(\x08"J\n\x12PredictPairRequest\x12\x0e\n\x06inputs\x18\x01 \x03(\t\x12\x10\n\x08truncate\x18\x02 \x01(\x08\x12\x12\n\nraw_scores\x18\x03 \x01(\x08"*\n\nPrediction\x12\r\n\x05score\x18\x01 \x01(\x02\x12\r\n\x05label\x18\x02 \x01(\t"^\n\x0fPredictResponse\x12\'\n\x0bpredictions\x18\x01 \x03(\x0b\x32\x12.tei.v1.Prediction\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata"h\n\rRerankRequest\x12\r\n\x05query\x18\x01 \x01(\t\x12\r\n\x05texts\x18\x02 \x03(\t\x12\x10\n\x08truncate\x18\x03 \x01(\x08\x12\x12\n\nraw_scores\x18\x04 \x01(\x08\x12\x13\n\x0breturn_text\x18\x05 \x01(\x08"m\n\x13RerankStreamRequest\x12\r\n\x05query\x18\x01 \x01(\t\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x10\n\x08truncate\x18\x03 \x01(\x08\x12\x12\n\nraw_scores\x18\x04 \x01(\x08\x12\x13\n\x0breturn_text\x18\x05 \x01(\x08"@\n\x04Rank\x12\r\n\x05index\x18\x01 \x01(\r\x12\x11\n\x04text\x18\x02 \x01(\tH\x00\x88\x01\x01\x12\r\n\x05score\x18\x03 \x01(\x02\x42\x07\n\x05_text"Q\n\x0eRerankResponse\x12\x1b\n\x05ranks\x18\x01 \x03(\x0b\x32\x0c.tei.v1.Rank\x12"\n\x08metadata\x18\x02 \x01(\x0b\x32\x10.tei.v1.Metadata";\n\rEncodeRequest\x12\x0e\n\x06inputs\x18\x01 \x01(\t\x12\x1a\n\x12\x61\x64\x64_special_tokens\x18\x02 \x01(\x08"r\n\x0bSimpleToken\x12\n\n\x02id\x18\x01 \x01(\r\x12\x0c\n\x04text\x18\x02 \x01(\t\x12\x0f\n\x07special\x18\x03 \x01(\x08\x12\x12\n\x05start\x18\x04 \x01(\rH\x00\x88\x01\x01\x12\x11\n\x04stop\x18\x05 \x01(\rH\x01\x88\x01\x01\x42\x08\n\x06_startB\x07\n\x05_stop"5\n\x0e\x45ncodeResponse\x12#\n\x06tokens\x18\x01 \x03(\x0b\x32\x13.tei.v1.SimpleToken"9\n\rDecodeRequest\x12\x0b\n\x03ids\x18\x01 \x03(\r\x12\x1b\n\x13skip_special_tokens\x18\x02 \x01(\x08"\x1e\n\x0e\x44\x65\x63odeResponse\x12\x0c\n\x04text\x18\x01 
\x01(\t*Y\n\tModelType\x12\x18\n\x14MODEL_TYPE_EMBEDDING\x10\x00\x12\x19\n\x15MODEL_TYPE_CLASSIFIER\x10\x01\x12\x17\n\x13MODEL_TYPE_RERANKER\x10\x02\x32>\n\x04Info\x12\x36\n\x04Info\x12\x13.tei.v1.InfoRequest\x1a\x14.tei.v1.InfoResponse"\x03\x90\x02\x02\x32\x9f\x03\n\x05\x45mbed\x12\x34\n\x05\x45mbed\x12\x14.tei.v1.EmbedRequest\x1a\x15.tei.v1.EmbedResponse\x12>\n\x0b\x45mbedStream\x12\x14.tei.v1.EmbedRequest\x1a\x15.tei.v1.EmbedResponse(\x01\x30\x01\x12\x46\n\x0b\x45mbedSparse\x12\x1a.tei.v1.EmbedSparseRequest\x1a\x1b.tei.v1.EmbedSparseResponse\x12P\n\x11\x45mbedSparseStream\x12\x1a.tei.v1.EmbedSparseRequest\x1a\x1b.tei.v1.EmbedSparseResponse(\x01\x30\x01\x12=\n\x08\x45mbedAll\x12\x17.tei.v1.EmbedAllRequest\x1a\x18.tei.v1.EmbedAllResponse\x12G\n\x0e\x45mbedAllStream\x12\x17.tei.v1.EmbedAllRequest\x1a\x18.tei.v1.EmbedAllResponse(\x01\x30\x01\x32\x9d\x02\n\x07Predict\x12:\n\x07Predict\x12\x16.tei.v1.PredictRequest\x1a\x17.tei.v1.PredictResponse\x12\x42\n\x0bPredictPair\x12\x1a.tei.v1.PredictPairRequest\x1a\x17.tei.v1.PredictResponse\x12\x44\n\rPredictStream\x12\x16.tei.v1.PredictRequest\x1a\x17.tei.v1.PredictResponse(\x01\x30\x01\x12L\n\x11PredictPairStream\x12\x1a.tei.v1.PredictPairRequest\x1a\x17.tei.v1.PredictResponse(\x01\x30\x01\x32\x88\x01\n\x06Rerank\x12\x37\n\x06Rerank\x12\x15.tei.v1.RerankRequest\x1a\x16.tei.v1.RerankResponse\x12\x45\n\x0cRerankStream\x12\x1b.tei.v1.RerankStreamRequest\x1a\x16.tei.v1.RerankResponse(\x01\x32\x86\x02\n\x08Tokenize\x12\x39\n\x08Tokenize\x12\x15.tei.v1.EncodeRequest\x1a\x16.tei.v1.EncodeResponse\x12\x43\n\x0eTokenizeStream\x12\x15.tei.v1.EncodeRequest\x1a\x16.tei.v1.EncodeResponse(\x01\x30\x01\x12\x37\n\x06\x44\x65\x63ode\x12\x15.tei.v1.DecodeRequest\x1a\x16.tei.v1.DecodeResponse\x12\x41\n\x0c\x44\x65\x63odeStream\x12\x15.tei.v1.DecodeRequest\x1a\x16.tei.v1.DecodeResponse(\x01\x30\x01\x62\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "tei_pb2", _globals) -if not _descriptor._USE_C_DESCRIPTORS: - DESCRIPTOR._loaded_options = None - _globals["_INFO"].methods_by_name["Info"]._loaded_options = None - _globals["_INFO"].methods_by_name["Info"]._serialized_options = b"\220\002\002" - _globals["_MODELTYPE"]._serialized_start = 2145 - _globals["_MODELTYPE"]._serialized_end = 2234 - _globals["_INFOREQUEST"]._serialized_start = 21 - _globals["_INFOREQUEST"]._serialized_end = 34 - _globals["_INFORESPONSE"]._serialized_start = 37 - _globals["_INFORESPONSE"]._serialized_end = 456 - _globals["_METADATA"]._serialized_start = 459 - _globals["_METADATA"]._serialized_end = 619 - _globals["_EMBEDREQUEST"]._serialized_start = 621 - _globals["_EMBEDREQUEST"]._serialized_end = 688 - _globals["_EMBEDRESPONSE"]._serialized_start = 690 - _globals["_EMBEDRESPONSE"]._serialized_end = 761 - _globals["_EMBEDSPARSEREQUEST"]._serialized_start = 763 - _globals["_EMBEDSPARSEREQUEST"]._serialized_end = 817 - _globals["_SPARSEVALUE"]._serialized_start = 819 - _globals["_SPARSEVALUE"]._serialized_end = 862 - _globals["_EMBEDSPARSERESPONSE"]._serialized_start = 864 - _globals["_EMBEDSPARSERESPONSE"]._serialized_end = 969 - _globals["_EMBEDALLREQUEST"]._serialized_start = 971 - _globals["_EMBEDALLREQUEST"]._serialized_end = 1022 - _globals["_TOKENEMBEDDING"]._serialized_start = 1024 - _globals["_TOKENEMBEDDING"]._serialized_end = 1060 - _globals["_EMBEDALLRESPONSE"]._serialized_start = 1062 - _globals["_EMBEDALLRESPONSE"]._serialized_end = 1166 - 
_globals["_PREDICTREQUEST"]._serialized_start = 1168 - _globals["_PREDICTREQUEST"]._serialized_end = 1238 - _globals["_PREDICTPAIRREQUEST"]._serialized_start = 1240 - _globals["_PREDICTPAIRREQUEST"]._serialized_end = 1314 - _globals["_PREDICTION"]._serialized_start = 1316 - _globals["_PREDICTION"]._serialized_end = 1358 - _globals["_PREDICTRESPONSE"]._serialized_start = 1360 - _globals["_PREDICTRESPONSE"]._serialized_end = 1454 - _globals["_RERANKREQUEST"]._serialized_start = 1456 - _globals["_RERANKREQUEST"]._serialized_end = 1560 - _globals["_RERANKSTREAMREQUEST"]._serialized_start = 1562 - _globals["_RERANKSTREAMREQUEST"]._serialized_end = 1671 - _globals["_RANK"]._serialized_start = 1673 - _globals["_RANK"]._serialized_end = 1737 - _globals["_RERANKRESPONSE"]._serialized_start = 1739 - _globals["_RERANKRESPONSE"]._serialized_end = 1820 - _globals["_ENCODEREQUEST"]._serialized_start = 1822 - _globals["_ENCODEREQUEST"]._serialized_end = 1881 - _globals["_SIMPLETOKEN"]._serialized_start = 1883 - _globals["_SIMPLETOKEN"]._serialized_end = 1997 - _globals["_ENCODERESPONSE"]._serialized_start = 1999 - _globals["_ENCODERESPONSE"]._serialized_end = 2052 - _globals["_DECODEREQUEST"]._serialized_start = 2054 - _globals["_DECODEREQUEST"]._serialized_end = 2111 - _globals["_DECODERESPONSE"]._serialized_start = 2113 - _globals["_DECODERESPONSE"]._serialized_end = 2143 - _globals["_INFO"]._serialized_start = 2236 - _globals["_INFO"]._serialized_end = 2298 - _globals["_EMBED"]._serialized_start = 2301 - _globals["_EMBED"]._serialized_end = 2716 - _globals["_PREDICT"]._serialized_start = 2719 - _globals["_PREDICT"]._serialized_end = 3004 - _globals["_RERANK"]._serialized_start = 3007 - _globals["_RERANK"]._serialized_end = 3143 - _globals["_TOKENIZE"]._serialized_start = 3146 - _globals["_TOKENIZE"]._serialized_end = 3408 -# @@protoc_insertion_point(module_scope) diff --git a/text-embeddings-inference/packages/tei_pb/tei_pb2_grpc.py b/text-embeddings-inference/packages/tei_pb/tei_pb2_grpc.py deleted file mode 100644 index def8ea5a..00000000 --- a/text-embeddings-inference/packages/tei_pb/tei_pb2_grpc.py +++ /dev/null @@ -1,983 +0,0 @@ -# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! -"""Client and server classes corresponding to protobuf-defined services.""" -import warnings - -import grpc - -from . import tei_pb2 as tei__pb2 - -GRPC_GENERATED_VERSION = "1.64.1" -GRPC_VERSION = grpc.__version__ -EXPECTED_ERROR_RELEASE = "1.65.0" -SCHEDULED_RELEASE_DATE = "June 25, 2024" -_version_not_supported = False - -try: - from grpc._utilities import first_version_is_lower - - _version_not_supported = first_version_is_lower( - GRPC_VERSION, GRPC_GENERATED_VERSION - ) -except ImportError: - _version_not_supported = True - -if _version_not_supported: - warnings.warn( - f"The grpc package installed is at version {GRPC_VERSION}," - + f" but the generated code in tei_pb2_grpc.py depends on" - + f" grpcio>={GRPC_GENERATED_VERSION}." - + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}" - + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}." - + f" This warning will become an error in {EXPECTED_ERROR_RELEASE}," - + f" scheduled for release on {SCHEDULED_RELEASE_DATE}.", - RuntimeWarning, - ) - - -class InfoStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Info = channel.unary_unary( - "/tei.v1.Info/Info", - request_serializer=tei__pb2.InfoRequest.SerializeToString, - response_deserializer=tei__pb2.InfoResponse.FromString, - _registered_method=True, - ) - - -class InfoServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Info(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_InfoServicer_to_server(servicer, server): - rpc_method_handlers = { - "Info": grpc.unary_unary_rpc_method_handler( - servicer.Info, - request_deserializer=tei__pb2.InfoRequest.FromString, - response_serializer=tei__pb2.InfoResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Info", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Info", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Info(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Info( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Info/Info", - tei__pb2.InfoRequest.SerializeToString, - tei__pb2.InfoResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class EmbedStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Embed = channel.unary_unary( - "/tei.v1.Embed/Embed", - request_serializer=tei__pb2.EmbedRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedResponse.FromString, - _registered_method=True, - ) - self.EmbedStream = channel.stream_stream( - "/tei.v1.Embed/EmbedStream", - request_serializer=tei__pb2.EmbedRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedResponse.FromString, - _registered_method=True, - ) - self.EmbedSparse = channel.unary_unary( - "/tei.v1.Embed/EmbedSparse", - request_serializer=tei__pb2.EmbedSparseRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedSparseResponse.FromString, - _registered_method=True, - ) - self.EmbedSparseStream = channel.stream_stream( - "/tei.v1.Embed/EmbedSparseStream", - request_serializer=tei__pb2.EmbedSparseRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedSparseResponse.FromString, - _registered_method=True, - ) - self.EmbedAll = channel.unary_unary( - "/tei.v1.Embed/EmbedAll", - request_serializer=tei__pb2.EmbedAllRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedAllResponse.FromString, - _registered_method=True, - ) - self.EmbedAllStream = channel.stream_stream( - "/tei.v1.Embed/EmbedAllStream", - request_serializer=tei__pb2.EmbedAllRequest.SerializeToString, - response_deserializer=tei__pb2.EmbedAllResponse.FromString, - _registered_method=True, - ) - - -class EmbedServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Embed(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedSparse(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedSparseStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedAll(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def EmbedAllStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_EmbedServicer_to_server(servicer, server): - rpc_method_handlers = { - "Embed": grpc.unary_unary_rpc_method_handler( - servicer.Embed, - request_deserializer=tei__pb2.EmbedRequest.FromString, - response_serializer=tei__pb2.EmbedResponse.SerializeToString, - ), - "EmbedStream": grpc.stream_stream_rpc_method_handler( - servicer.EmbedStream, - request_deserializer=tei__pb2.EmbedRequest.FromString, - 
response_serializer=tei__pb2.EmbedResponse.SerializeToString, - ), - "EmbedSparse": grpc.unary_unary_rpc_method_handler( - servicer.EmbedSparse, - request_deserializer=tei__pb2.EmbedSparseRequest.FromString, - response_serializer=tei__pb2.EmbedSparseResponse.SerializeToString, - ), - "EmbedSparseStream": grpc.stream_stream_rpc_method_handler( - servicer.EmbedSparseStream, - request_deserializer=tei__pb2.EmbedSparseRequest.FromString, - response_serializer=tei__pb2.EmbedSparseResponse.SerializeToString, - ), - "EmbedAll": grpc.unary_unary_rpc_method_handler( - servicer.EmbedAll, - request_deserializer=tei__pb2.EmbedAllRequest.FromString, - response_serializer=tei__pb2.EmbedAllResponse.SerializeToString, - ), - "EmbedAllStream": grpc.stream_stream_rpc_method_handler( - servicer.EmbedAllStream, - request_deserializer=tei__pb2.EmbedAllRequest.FromString, - response_serializer=tei__pb2.EmbedAllResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Embed", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Embed", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Embed(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Embed( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Embed/Embed", - tei__pb2.EmbedRequest.SerializeToString, - tei__pb2.EmbedResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Embed/EmbedStream", - tei__pb2.EmbedRequest.SerializeToString, - tei__pb2.EmbedResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedSparse( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Embed/EmbedSparse", - tei__pb2.EmbedSparseRequest.SerializeToString, - tei__pb2.EmbedSparseResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedSparseStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Embed/EmbedSparseStream", - tei__pb2.EmbedSparseRequest.SerializeToString, - tei__pb2.EmbedSparseResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - 
_registered_method=True, - ) - - @staticmethod - def EmbedAll( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Embed/EmbedAll", - tei__pb2.EmbedAllRequest.SerializeToString, - tei__pb2.EmbedAllResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def EmbedAllStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Embed/EmbedAllStream", - tei__pb2.EmbedAllRequest.SerializeToString, - tei__pb2.EmbedAllResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class PredictStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.Predict = channel.unary_unary( - "/tei.v1.Predict/Predict", - request_serializer=tei__pb2.PredictRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - self.PredictPair = channel.unary_unary( - "/tei.v1.Predict/PredictPair", - request_serializer=tei__pb2.PredictPairRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - self.PredictStream = channel.stream_stream( - "/tei.v1.Predict/PredictStream", - request_serializer=tei__pb2.PredictRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - self.PredictPairStream = channel.stream_stream( - "/tei.v1.Predict/PredictPairStream", - request_serializer=tei__pb2.PredictPairRequest.SerializeToString, - response_deserializer=tei__pb2.PredictResponse.FromString, - _registered_method=True, - ) - - -class PredictServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Predict(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PredictPair(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PredictStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def PredictPairStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_PredictServicer_to_server(servicer, server): - rpc_method_handlers = 
{ - "Predict": grpc.unary_unary_rpc_method_handler( - servicer.Predict, - request_deserializer=tei__pb2.PredictRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - "PredictPair": grpc.unary_unary_rpc_method_handler( - servicer.PredictPair, - request_deserializer=tei__pb2.PredictPairRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - "PredictStream": grpc.stream_stream_rpc_method_handler( - servicer.PredictStream, - request_deserializer=tei__pb2.PredictRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - "PredictPairStream": grpc.stream_stream_rpc_method_handler( - servicer.PredictPairStream, - request_deserializer=tei__pb2.PredictPairRequest.FromString, - response_serializer=tei__pb2.PredictResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Predict", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Predict", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Predict(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Predict( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Predict/Predict", - tei__pb2.PredictRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def PredictPair( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Predict/PredictPair", - tei__pb2.PredictPairRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def PredictStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Predict/PredictStream", - tei__pb2.PredictRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def PredictPairStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Predict/PredictPairStream", - tei__pb2.PredictPairRequest.SerializeToString, - tei__pb2.PredictResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class RerankStub(object): - """Missing 
associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. - """ - self.Rerank = channel.unary_unary( - "/tei.v1.Rerank/Rerank", - request_serializer=tei__pb2.RerankRequest.SerializeToString, - response_deserializer=tei__pb2.RerankResponse.FromString, - _registered_method=True, - ) - self.RerankStream = channel.stream_unary( - "/tei.v1.Rerank/RerankStream", - request_serializer=tei__pb2.RerankStreamRequest.SerializeToString, - response_deserializer=tei__pb2.RerankResponse.FromString, - _registered_method=True, - ) - - -class RerankServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Rerank(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def RerankStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_RerankServicer_to_server(servicer, server): - rpc_method_handlers = { - "Rerank": grpc.unary_unary_rpc_method_handler( - servicer.Rerank, - request_deserializer=tei__pb2.RerankRequest.FromString, - response_serializer=tei__pb2.RerankResponse.SerializeToString, - ), - "RerankStream": grpc.stream_unary_rpc_method_handler( - servicer.RerankStream, - request_deserializer=tei__pb2.RerankStreamRequest.FromString, - response_serializer=tei__pb2.RerankResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Rerank", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Rerank", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. -class Rerank(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Rerank( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Rerank/Rerank", - tei__pb2.RerankRequest.SerializeToString, - tei__pb2.RerankResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def RerankStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_unary( - request_iterator, - target, - "/tei.v1.Rerank/RerankStream", - tei__pb2.RerankStreamRequest.SerializeToString, - tei__pb2.RerankResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - -class TokenizeStub(object): - """Missing associated documentation comment in .proto file.""" - - def __init__(self, channel): - """Constructor. - - Args: - channel: A grpc.Channel. 
- """ - self.Tokenize = channel.unary_unary( - "/tei.v1.Tokenize/Tokenize", - request_serializer=tei__pb2.EncodeRequest.SerializeToString, - response_deserializer=tei__pb2.EncodeResponse.FromString, - _registered_method=True, - ) - self.TokenizeStream = channel.stream_stream( - "/tei.v1.Tokenize/TokenizeStream", - request_serializer=tei__pb2.EncodeRequest.SerializeToString, - response_deserializer=tei__pb2.EncodeResponse.FromString, - _registered_method=True, - ) - self.Decode = channel.unary_unary( - "/tei.v1.Tokenize/Decode", - request_serializer=tei__pb2.DecodeRequest.SerializeToString, - response_deserializer=tei__pb2.DecodeResponse.FromString, - _registered_method=True, - ) - self.DecodeStream = channel.stream_stream( - "/tei.v1.Tokenize/DecodeStream", - request_serializer=tei__pb2.DecodeRequest.SerializeToString, - response_deserializer=tei__pb2.DecodeResponse.FromString, - _registered_method=True, - ) - - -class TokenizeServicer(object): - """Missing associated documentation comment in .proto file.""" - - def Tokenize(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def TokenizeStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def Decode(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - def DecodeStream(self, request_iterator, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") - - -def add_TokenizeServicer_to_server(servicer, server): - rpc_method_handlers = { - "Tokenize": grpc.unary_unary_rpc_method_handler( - servicer.Tokenize, - request_deserializer=tei__pb2.EncodeRequest.FromString, - response_serializer=tei__pb2.EncodeResponse.SerializeToString, - ), - "TokenizeStream": grpc.stream_stream_rpc_method_handler( - servicer.TokenizeStream, - request_deserializer=tei__pb2.EncodeRequest.FromString, - response_serializer=tei__pb2.EncodeResponse.SerializeToString, - ), - "Decode": grpc.unary_unary_rpc_method_handler( - servicer.Decode, - request_deserializer=tei__pb2.DecodeRequest.FromString, - response_serializer=tei__pb2.DecodeResponse.SerializeToString, - ), - "DecodeStream": grpc.stream_stream_rpc_method_handler( - servicer.DecodeStream, - request_deserializer=tei__pb2.DecodeRequest.FromString, - response_serializer=tei__pb2.DecodeResponse.SerializeToString, - ), - } - generic_handler = grpc.method_handlers_generic_handler( - "tei.v1.Tokenize", rpc_method_handlers - ) - server.add_generic_rpc_handlers((generic_handler,)) - server.add_registered_method_handlers("tei.v1.Tokenize", rpc_method_handlers) - - -# This class is part of an EXPERIMENTAL API. 
-class Tokenize(object): - """Missing associated documentation comment in .proto file.""" - - @staticmethod - def Tokenize( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Tokenize/Tokenize", - tei__pb2.EncodeRequest.SerializeToString, - tei__pb2.EncodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def TokenizeStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Tokenize/TokenizeStream", - tei__pb2.EncodeRequest.SerializeToString, - tei__pb2.EncodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def Decode( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, - target, - "/tei.v1.Tokenize/Decode", - tei__pb2.DecodeRequest.SerializeToString, - tei__pb2.DecodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) - - @staticmethod - def DecodeStream( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, - target, - "/tei.v1.Tokenize/DecodeStream", - tei__pb2.DecodeRequest.SerializeToString, - tei__pb2.DecodeResponse.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - _registered_method=True, - ) diff --git a/text-embeddings-inference/packages/tei_pb/text_embeddings_router_config.py b/text-embeddings-inference/packages/tei_pb/text_embeddings_router_config.py deleted file mode 100644 index b5764655..00000000 --- a/text-embeddings-inference/packages/tei_pb/text_embeddings_router_config.py +++ /dev/null @@ -1,53 +0,0 @@ -import subprocess -from typing import Any, Dict, List, Optional - - -class Config: - def __init__(self, config: Dict[str, Any]): - self.config = config - self._validate_config() - - def _validate_config(self): - required_keys = ["model_id"] - for key in required_keys: - if key not in self.config: - raise ValueError(f"Missing required configuration key: {key}") - - def get_command(self) -> List[str]: - command = [ - "/usr/local/bin/text-embeddings-router", - "--port", - "80", - "--model-id", - self.config["model_id"], - ] - - optional_params = { - "revision": "--revision", - "tokenization_workers": "--tokenization-workers", - "dtype": "--dtype", - "pooling": "--pooling", - "max_concurrent_requests": "--max-concurrent-requests", - "max_batch_tokens": "--max-batch-tokens", - "max_batch_requests": "--max-batch-requests", - "max_client_batch_size": "--max-client-batch-size", - 
"hf_api_token": "--hf-api-token", - "uds_path": "--uds-path", - "huggingface_hub_cache": "--huggingface-hub-cache", - "payload_limit": "--payload-limit", - "api_key": "--api-key", - "json_output": "--json-output", - "otlp_endpoint": "--otlp-endpoint", - } - - for key, param in optional_params.items(): - value = self.config.get(key) - if value is not None: - command.extend([param, str(value)]) - - return command - - def run_router(self): - command = self.get_command() - with open("/var/log/text_embeddings_router.log", "w") as log_file: - subprocess.Popen(command, stdout=log_file, stderr=log_file)