Unverified commit 4eb25632, authored by Biswa Panda, committed by GitHub
Browse files

feat: use consistent small models across all deploy examples (#2573)

parent 26b3b609
......@@ -193,7 +193,7 @@ Send a test request to verify your deployment:
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
......
......@@ -32,8 +32,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
......
......@@ -35,8 +35,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
......
......@@ -68,8 +68,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path meta-llama/Llama-3.3-70B-Instruct
--served-model-name meta-llama/Llama-3.3-70B-Instruct
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--tp-size 8
--trust-remote-code
--skip-tokenizer-init
......
......@@ -32,8 +32,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
......@@ -59,8 +59,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
......
......@@ -116,8 +116,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
......@@ -142,8 +142,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
......
......@@ -11,7 +11,7 @@ This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dyna
```bash
python -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--host 0.0.0.0 --port 8000 \
--page-size 64 \
--enable-hierarchical-cache \
......@@ -39,7 +39,7 @@ python -m dynamo.frontend --http-port 8000
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
......@@ -56,7 +56,7 @@ curl localhost:8000/v1/chat/completions \
Run the perf script:
```bash
bash -x /workspace/benchmarks/llm/perf.sh \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model Qwen/Qwen3-0.6B \
--tensor-parallelism 1 \
--data-parallelism 1 \
--concurrency "2,4,8" \
......
......@@ -20,8 +20,8 @@ DYNAMO_PID=$!
# run worker
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......
......@@ -20,8 +20,8 @@ DYNAMO_PID=$!
# run worker
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......@@ -30,8 +30,8 @@ python3 -m dynamo.sglang \
WORKER_PID=$!
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......
......@@ -20,8 +20,8 @@ DYNAMO_PID=$!
# run prefill worker
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......@@ -32,8 +32,8 @@ PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
......
......@@ -32,6 +32,6 @@ spec:
args:
- >-
python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--extra-engine-args engine_configs/agg.yaml
......@@ -35,7 +35,7 @@ spec:
args:
- >-
python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--extra-engine-args engine_configs/agg.yaml
--publish-events-and-metrics
......@@ -30,7 +30,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret
......@@ -47,4 +47,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
......@@ -33,7 +33,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret
......@@ -50,4 +50,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
......@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
......
......@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
# Setup cleanup trap
......
......@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
......
......@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment