Skip to content

Commit

Permalink
add deepseek engines (#400)
Browse files Browse the repository at this point in the history
Add DeepSeek-R1 distill Qwen and Llama engines via the engine builder.

Manually tested and confirmed working.
  • Loading branch information
philipkiely-baseten authored Jan 21, 2025
1 parent efbba09 commit d0cc257
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 0 deletions.
1 change: 1 addition & 0 deletions deepseek/engine-deepseek-r1-distill-llama-70b/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# DeepSeek-R1 Distill Llama 70B
53 changes: 53 additions & 0 deletions deepseek/engine-deepseek-r1-distill-llama-70b/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Truss engine-builder config: DeepSeek-R1 Distill Llama 70B served via TensorRT-LLM.
build_commands: []
environment_variables: {}
external_package_dirs: []
model_metadata:
  # Sample request payload (block style; parses identically to the flow form).
  example_model_input:
    messages:
      - role: "system"
        content: "You are a helpful and harmless assistant. You are Llama developed by Meta. You should think step-by-step."
      - role: "user"
        content: "Which is heavier, a pound of bricks or a pound of feathers?"
    stream: true
    max_tokens: 1024
    # Sampling parameters for the example request.
    temperature: 0.6
    top_p: 1.0
    top_k: 40
    frequency_penalty: 1
  repo_id: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
model_name: DeepSeek R1 Distill Llama 70B
python_version: py39
requirements: []
resources:
  # Two H100s at serve time — matches tensor_parallel_count below.
  accelerator: H100:2
  cpu: '1'
  memory: 24Gi
  use_gpu: true
secrets:
  # Placeholder value — the real token is resolved from Baseten workspace secrets.
  hf_access_token: set token in baseten workspace
system_packages: []
trt_llm:
  build:
    base_model: llama
    checkpoint_repository:
      repo: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
      source: HF
    # 4 GPUs for the engine build vs 2 for serving — presumably extra memory
    # headroom for quantization; TODO confirm against engine-builder docs.
    num_builder_gpus: 4
    quantization_type: fp8_kv
    max_seq_len: 32768
    tensor_parallel_count: 2
    plugin_configuration:
      use_paged_context_fmha: true
      use_fp8_context_fmha: true
      paged_kv_cache: true
  runtime:
    batch_scheduler_policy: max_utilization
    enable_chunked_context: true
    kv_cache_free_gpu_mem_fraction: 0.85
    request_default_max_tokens: 32768
1 change: 1 addition & 0 deletions deepseek/engine-deepseek-r1-distill-qwen-32b/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# DeepSeek-R1 Distill Qwen 32B
49 changes: 49 additions & 0 deletions deepseek/engine-deepseek-r1-distill-qwen-32b/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Truss engine-builder config: DeepSeek-R1 Distill Qwen 32B served via TensorRT-LLM.
build_commands: []
environment_variables: {}
external_package_dirs: []
model_metadata:
  # Sample request payload (block style; parses identically to the flow form).
  example_model_input:
    messages:
      - role: "system"
        content: "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."
      - role: "user"
        content: "Which is heavier, a pound of bricks or a pound of feathers?"
    stream: true
    max_tokens: 1024
    temperature: 0.6
  repo_id: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
model_name: DeepSeek R1 Distill Qwen 32B
python_version: py39
requirements: []
resources:
  # Single H100 — matches tensor_parallel_count below.
  accelerator: H100
  cpu: '1'
  memory: 24Gi
  use_gpu: true
# NOTE(review): no HF token configured here (unlike the 70B sibling config) —
# presumably the checkpoint repo is public; verify before deploying.
secrets: {}
system_packages: []
trt_llm:
  build:
    base_model: qwen
    checkpoint_repository:
      repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
      source: HF
    # 2 GPUs for the engine build vs 1 for serving — presumably extra memory
    # headroom for quantization; TODO confirm against engine-builder docs.
    num_builder_gpus: 2
    quantization_type: fp8_kv
    max_seq_len: 32768
    tensor_parallel_count: 1
    plugin_configuration:
      use_paged_context_fmha: true
      use_fp8_context_fmha: true
      paged_kv_cache: true
  runtime:
    batch_scheduler_policy: max_utilization
    enable_chunked_context: true
    kv_cache_free_gpu_mem_fraction: 0.85
    request_default_max_tokens: 32768

0 comments on commit d0cc257

Please sign in to comment.