"vllm/vscode:/vscode.git/clone" did not exist on "2010f04c17e76c7d1f70f6e1c9d3857a93036114"
Unverified Commit 7893f268 authored by Alec, committed by GitHub
Browse files

feat: add --disaggregation-mode enum to vLLM backend (#6483)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 6d3e0137
......@@ -77,4 +77,5 @@ spec:
- dynamo.vllm
- --model
- Qwen/Qwen3-0.6B
- --is-prefill-worker
- --disaggregation-mode
- prefill
......@@ -34,7 +34,8 @@ spec:
args:
- --model
- Qwen/Qwen3-0.6B
- --is-decode-worker
- --disaggregation-mode
- decode
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
......@@ -53,6 +54,7 @@ spec:
args:
- --model
- Qwen/Qwen3-0.6B
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --kv-events-config
- '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'
......@@ -10,12 +10,12 @@ python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-decode-worker &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode decode &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-prefill-worker \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -19,7 +19,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--is-prefill-worker \
--disaggregation-mode prefill \
--connector kvbm nixl \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -10,9 +10,9 @@ python -m dynamo.frontend --router-mode kv &
# run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --disaggregation-mode decode &
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
......@@ -22,7 +22,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=2 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--is-prefill-worker \
--disaggregation-mode prefill \
--connector kvbm nixl \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}' &
......@@ -34,7 +34,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=3 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--is-prefill-worker \
--disaggregation-mode prefill \
--connector kvbm nixl \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
......@@ -20,13 +20,13 @@ python -m dynamo.frontend \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
--is-decode-worker &
--disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
--is-decode-worker &
--disaggregation-mode decode &
# two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
......@@ -37,7 +37,7 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
--is-prefill-worker \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
......@@ -48,6 +48,6 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
--is-prefill-worker \
--disaggregation-mode prefill \
--connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
......@@ -19,6 +19,6 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--is-prefill-worker \
--disaggregation-mode prefill \
--connector lmcache nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -71,7 +71,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py
# Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --route-to-encoder --is-prefill-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
......
......@@ -72,7 +72,7 @@ if [[ $HEAD_NODE -eq 1 ]]; then
python -m dynamo.vllm \
--enable-multimodal \
--model $MODEL_NAME \
--is-prefill-worker \
--disaggregation-mode prefill \
$MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
"${EXTRA_ARGS[@]}" &
......
......@@ -24,24 +24,24 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--is-decode-worker &
--disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--is-decode-worker &
--disaggregation-mode decode &
# two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected
# When registered with --disaggregation-mode prefill, these workers are automatically detected
# by the frontend, which activates an internal prefill router for KV-aware prefill routing
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--is-prefill-worker \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
......@@ -49,5 +49,5 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--is-prefill-worker \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
......@@ -33,7 +33,7 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--is-decode-worker &
--disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
......@@ -41,10 +41,10 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--is-decode-worker &
--disaggregation-mode decode &
# two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected
# When registered with --disaggregation-mode prefill, these workers are automatically detected
# by the frontend, which activates an internal prefill router for KV-aware prefill routing
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
......@@ -52,7 +52,7 @@ HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--is-prefill-worker \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558", "enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
......@@ -61,5 +61,5 @@ HABANA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--is-prefill-worker \
--disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}'
......@@ -55,7 +55,7 @@ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-decode-worker \
--disaggregation-mode decode \
--gpu-memory-utilization ${GPU_MEM_FRACTION} &
DECODE_PID=$!
......@@ -75,7 +75,7 @@ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-prefill-worker \
--disaggregation-mode prefill \
--gpu-memory-utilization ${GPU_MEM_FRACTION} \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
......@@ -82,7 +82,7 @@ Leave this terminal running - it will show Decode Worker logs.
```bash
export DYN_LOG=debug # Increase log verbosity to see disaggregation
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker \
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
```
......
......@@ -42,7 +42,7 @@ spec:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 8 --is-decode-worker
- python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 8 --disaggregation-mode decode
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
......@@ -69,4 +69,4 @@ spec:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 4 --is-prefill-worker --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'
- python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 4 --disaggregation-mode prefill --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'
......@@ -90,7 +90,7 @@ Add `sh,-c` in **Entry point** and `cd examples/backends/vllm && python -m dynam
Create the PrefillWorker task same as the frontend worker, except for following changes
- Set container name as `dynamo-prefill`
- No container port mapping
- Docker configuration with command `cd examples/backends/vllm && python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker`
- Docker configuration with command `cd examples/backends/vllm && python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode prefill`
## 5. Task Deployment
You can create a service or directly run the task from the task definition
......
......@@ -15,7 +15,7 @@
"-c"
],
"command": [
"python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker"
"python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode prefill"
],
"environment": [
{
......
......@@ -35,7 +35,7 @@ spec:
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/nvidia/bin:/usr/local/nvidia/lib64
/sbin/ldconfig
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --disaggregation-mode decode
VllmPrefillWorker:
envFromSecret: hf-token-secret
componentType: worker
......@@ -56,4 +56,4 @@ spec:
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/nvidia/bin:/usr/local/nvidia/lib64
/sbin/ldconfig
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --disaggregation-mode prefill
......@@ -33,7 +33,7 @@ DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \
--endpoint dyn://prefill_pool_0.worker.generate \
--is-prefill-worker \
--disaggregation-mode prefill \
--block-size 16 &
# ============================================================================
......@@ -47,7 +47,7 @@ DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \
--endpoint dyn://prefill_pool_1.worker.generate \
--is-prefill-worker \
--disaggregation-mode prefill \
--block-size 16 &
# ============================================================================
......@@ -61,7 +61,7 @@ DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \
--endpoint dyn://decode_pool_0.worker.generate \
--is-decode-worker \
--disaggregation-mode decode \
--block-size 16 &
# ============================================================================
......
......@@ -161,7 +161,8 @@ spec:
args:
- --model
- Qwen/Qwen3-0.6B
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --tensor-parallel-size
- "1"
- --gpu-memory-utilization
......@@ -226,7 +227,8 @@ spec:
args:
- --model
- Qwen/Qwen3-0.6B
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --tensor-parallel-size
- "1"
- --gpu-memory-utilization
......
......@@ -142,7 +142,8 @@ spec:
args:
- --model
- /model-cache/deepseek-r1
- --is-prefill-worker
- --disaggregation-mode
- prefill
- --served-model-name
- deepseek-ai/DeepSeek-R1
- --all2all-backend
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment