Unverified commit 4eb25632, authored by Biswa Panda, committed by GitHub
Browse files

feat: use consistent small models across all deploy examples (#2573)

parent 26b3b609
...@@ -193,7 +193,7 @@ Send a test request to verify your deployment: ...@@ -193,7 +193,7 @@ Send a test request to verify your deployment:
curl localhost:8000/v1/chat/completions \ curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "model": "Qwen/Qwen3-0.6B",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
......
...@@ -32,8 +32,8 @@ spec: ...@@ -32,8 +32,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--page-size 16 --page-size 16
--tp 1 --tp 1
--trust-remote-code --trust-remote-code
......
...@@ -35,8 +35,8 @@ spec: ...@@ -35,8 +35,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--page-size 16 --page-size 16
--tp 1 --tp 1
--trust-remote-code --trust-remote-code
......
...@@ -68,8 +68,8 @@ spec: ...@@ -68,8 +68,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path meta-llama/Llama-3.3-70B-Instruct --model-path Qwen/Qwen3-0.6B
--served-model-name meta-llama/Llama-3.3-70B-Instruct --served-model-name Qwen/Qwen3-0.6B
--tp-size 8 --tp-size 8
--trust-remote-code --trust-remote-code
--skip-tokenizer-init --skip-tokenizer-init
......
...@@ -32,8 +32,8 @@ spec: ...@@ -32,8 +32,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--page-size 16 --page-size 16
--tp 1 --tp 1
--trust-remote-code --trust-remote-code
...@@ -59,8 +59,8 @@ spec: ...@@ -59,8 +59,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--page-size 16 --page-size 16
--tp 1 --tp 1
--trust-remote-code --trust-remote-code
......
...@@ -116,8 +116,8 @@ spec: ...@@ -116,8 +116,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--page-size 16 --page-size 16
--tp 1 --tp 1
--trust-remote-code --trust-remote-code
...@@ -142,8 +142,8 @@ spec: ...@@ -142,8 +142,8 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.sglang python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--page-size 16 --page-size 16
--tp 1 --tp 1
--trust-remote-code --trust-remote-code
......
...@@ -11,7 +11,7 @@ This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dyna ...@@ -11,7 +11,7 @@ This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dyna
```bash ```bash
python -m dynamo.sglang \ python -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--host 0.0.0.0 --port 8000 \ --host 0.0.0.0 --port 8000 \
--page-size 64 \ --page-size 64 \
--enable-hierarchical-cache \ --enable-hierarchical-cache \
...@@ -39,7 +39,7 @@ python -m dynamo.frontend --http-port 8000 ...@@ -39,7 +39,7 @@ python -m dynamo.frontend --http-port 8000
curl localhost:8000/v1/chat/completions \ curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", "model": "Qwen/Qwen3-0.6B",
"messages": [ "messages": [
{ {
"role": "user", "role": "user",
...@@ -56,7 +56,7 @@ curl localhost:8000/v1/chat/completions \ ...@@ -56,7 +56,7 @@ curl localhost:8000/v1/chat/completions \
Run the perf script: Run the perf script:
```bash ```bash
bash -x /workspace/benchmarks/llm/perf.sh \ bash -x /workspace/benchmarks/llm/perf.sh \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model Qwen/Qwen3-0.6B \
--tensor-parallelism 1 \ --tensor-parallelism 1 \
--data-parallelism 1 \ --data-parallelism 1 \
--concurrency "2,4,8" \ --concurrency "2,4,8" \
......
...@@ -20,8 +20,8 @@ DYNAMO_PID=$! ...@@ -20,8 +20,8 @@ DYNAMO_PID=$!
# run worker # run worker
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -20,8 +20,8 @@ DYNAMO_PID=$! ...@@ -20,8 +20,8 @@ DYNAMO_PID=$!
# run worker # run worker
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -30,8 +30,8 @@ python3 -m dynamo.sglang \ ...@@ -30,8 +30,8 @@ python3 -m dynamo.sglang \
WORKER_PID=$! WORKER_PID=$!
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -20,8 +20,8 @@ DYNAMO_PID=$! ...@@ -20,8 +20,8 @@ DYNAMO_PID=$!
# run prefill worker # run prefill worker
python3 -m dynamo.sglang \ python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
...@@ -32,8 +32,8 @@ PREFILL_PID=$! ...@@ -32,8 +32,8 @@ PREFILL_PID=$!
# run decode worker # run decode worker
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --model-path Qwen/Qwen3-0.6B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \ --served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \ --page-size 16 \
--tp 1 \ --tp 1 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -32,6 +32,6 @@ spec: ...@@ -32,6 +32,6 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.trtllm python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--extra-engine-args engine_configs/agg.yaml --extra-engine-args engine_configs/agg.yaml
...@@ -35,7 +35,7 @@ spec: ...@@ -35,7 +35,7 @@ spec:
args: args:
- >- - >-
python3 -m dynamo.trtllm python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --model-path Qwen/Qwen3-0.6B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name Qwen/Qwen3-0.6B
--extra-engine-args engine_configs/agg.yaml --extra-engine-args engine_configs/agg.yaml
--publish-events-and-metrics --publish-events-and-metrics
...@@ -30,7 +30,7 @@ spec: ...@@ -30,7 +30,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first" - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
TRTLLMDecodeWorker: TRTLLMDecodeWorker:
dynamoNamespace: trtllm-disagg dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
...@@ -47,4 +47,4 @@ spec: ...@@ -47,4 +47,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first" - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
...@@ -33,7 +33,7 @@ spec: ...@@ -33,7 +33,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics" - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
TRTLLMDecodeWorker: TRTLLMDecodeWorker:
dynamoNamespace: trtllm-v1-disagg-router dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
...@@ -50,4 +50,4 @@ spec: ...@@ -50,4 +50,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first" - "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
export MODALITY=${MODALITY:-"text"} export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal" # If you want to use multimodal, set MODALITY to "multimodal"
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"} export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
# Setup cleanup trap # Setup cleanup trap
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Environment variables with defaults # Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"} export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment