Skip to content

Commit

Permalink
add deepseek engines (#400)
Browse files Browse the repository at this point in the history
Add DeepSeek-R1 distill Qwen and Llama engines via the engine builder.

Manually tested and confirmed working.
  • Loading branch information
philipkiely-baseten authored Jan 21, 2025
1 parent efbba09 commit d0cc257
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 0 deletions.
1 change: 1 addition & 0 deletions deepseek/engine-deepseek-r1-distill-llama-70b/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# DeepSeek-R1 Distill Llama 70B
53 changes: 53 additions & 0 deletions deepseek/engine-deepseek-r1-distill-llama-70b/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Truss engine-builder config: DeepSeek-R1 Distill Llama 70B served via TensorRT-LLM.
build_commands: []
environment_variables: {}
external_package_dirs: []
model_metadata:
  # Sample request payload (block style; parses identically to the flow form).
  example_model_input:
    messages:
      - role: "system"
        content: "You are a helpful and harmless assistant. You are Llama developed by Meta. You should think step-by-step."
      - role: "user"
        content: "Which is heavier, a pound of bricks or a pound of feathers?"
    stream: true
    max_tokens: 1024
    # Sampling parameters for the example request.
    temperature: 0.6
    top_p: 1.0
    top_k: 40
    frequency_penalty: 1
  repo_id: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
model_name: DeepSeek R1 Distill Llama 70B
python_version: py39
requirements: []
resources:
  # Two H100s at serve time — matches tensor_parallel_count below.
  accelerator: H100:2
  cpu: '1'
  memory: 24Gi
  use_gpu: true
secrets:
  # Placeholder value — the real token is resolved from Baseten workspace secrets.
  hf_access_token: set token in baseten workspace
system_packages: []
trt_llm:
  build:
    base_model: llama
    checkpoint_repository:
      repo: deepseek-ai/DeepSeek-R1-Distill-Llama-70B
      source: HF
    # 4 GPUs for the engine build vs 2 for serving — presumably extra memory
    # headroom for quantization; TODO confirm against engine-builder docs.
    num_builder_gpus: 4
    quantization_type: fp8_kv
    max_seq_len: 32768
    tensor_parallel_count: 2
    plugin_configuration:
      use_paged_context_fmha: true
      use_fp8_context_fmha: true
      paged_kv_cache: true
  runtime:
    batch_scheduler_policy: max_utilization
    enable_chunked_context: true
    kv_cache_free_gpu_mem_fraction: 0.85
    request_default_max_tokens: 32768
1 change: 1 addition & 0 deletions deepseek/engine-deepseek-r1-distill-qwen-32b/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# DeepSeek-R1 Distill Qwen 32B
49 changes: 49 additions & 0 deletions deepseek/engine-deepseek-r1-distill-qwen-32b/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Truss engine-builder config: DeepSeek-R1 Distill Qwen 32B served via TensorRT-LLM.
build_commands: []
environment_variables: {}
external_package_dirs: []
model_metadata:
  # Sample request payload (block style; parses identically to the flow form).
  example_model_input:
    messages:
      - role: "system"
        content: "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."
      - role: "user"
        content: "Which is heavier, a pound of bricks or a pound of feathers?"
    stream: true
    max_tokens: 1024
    temperature: 0.6
  repo_id: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
model_name: DeepSeek R1 Distill Qwen 32B
python_version: py39
requirements: []
resources:
  # Single H100 — matches tensor_parallel_count below.
  accelerator: H100
  cpu: '1'
  memory: 24Gi
  use_gpu: true
# NOTE(review): no HF token configured here (unlike the 70B sibling config) —
# presumably the checkpoint repo is public; verify before deploying.
secrets: {}
system_packages: []
trt_llm:
  build:
    base_model: qwen
    checkpoint_repository:
      repo: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
      source: HF
    # 2 GPUs for the engine build vs 1 for serving — presumably extra memory
    # headroom for quantization; TODO confirm against engine-builder docs.
    num_builder_gpus: 2
    quantization_type: fp8_kv
    max_seq_len: 32768
    tensor_parallel_count: 1
    plugin_configuration:
      use_paged_context_fmha: true
      use_fp8_context_fmha: true
      paged_kv_cache: true
  runtime:
    batch_scheduler_policy: max_utilization
    enable_chunked_context: true
    kv_cache_free_gpu_mem_fraction: 0.85
    request_default_max_tokens: 32768

0 comments on commit d0cc257

Please sign in to comment.