Unverified commit 7893f268, authored by Alec, committed by GitHub
Browse files

feat: add --disaggregation-mode enum to vLLM backend (#6483)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 6d3e0137
...@@ -77,4 +77,5 @@ spec: ...@@ -77,4 +77,5 @@ spec:
- dynamo.vllm - dynamo.vllm
- --model - --model
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --is-prefill-worker - --disaggregation-mode
- prefill
...@@ -34,7 +34,8 @@ spec: ...@@ -34,7 +34,8 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --is-decode-worker - --disaggregation-mode
- decode
VllmPrefillWorker: VllmPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -53,6 +54,7 @@ spec: ...@@ -53,6 +54,7 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --is-prefill-worker - --disaggregation-mode
- prefill
- --kv-events-config - --kv-events-config
- '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' - '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'
...@@ -10,12 +10,12 @@ python -m dynamo.frontend & ...@@ -10,12 +10,12 @@ python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-decode-worker & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode decode &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -19,7 +19,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -19,7 +19,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--is-prefill-worker \ --disaggregation-mode prefill \
--connector kvbm nixl \ --connector kvbm nixl \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -10,9 +10,9 @@ python -m dynamo.frontend --router-mode kv & ...@@ -10,9 +10,9 @@ python -m dynamo.frontend --router-mode kv &
# run decode workers on GPU 0 and 1, without enabling KVBM # run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker & CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --disaggregation-mode decode &
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache # run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts # NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
...@@ -22,7 +22,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -22,7 +22,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=2 \ CUDA_VISIBLE_DEVICES=2 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--is-prefill-worker \ --disaggregation-mode prefill \
--connector kvbm nixl \ --connector kvbm nixl \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}' &
...@@ -34,7 +34,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -34,7 +34,7 @@ DYN_KVBM_CPU_CACHE_GB=20 \
CUDA_VISIBLE_DEVICES=3 \ CUDA_VISIBLE_DEVICES=3 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--is-prefill-worker \ --disaggregation-mode prefill \
--connector kvbm nixl \ --connector kvbm nixl \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
...@@ -20,13 +20,13 @@ python -m dynamo.frontend \ ...@@ -20,13 +20,13 @@ python -m dynamo.frontend \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
--is-decode-worker & --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
--is-decode-worker & --disaggregation-mode decode &
# two prefill workers with KVBM enabled # two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts # Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
...@@ -37,7 +37,7 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -37,7 +37,7 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --disaggregation-mode prefill \
--connector kvbm nixl \ --connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
...@@ -48,6 +48,6 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -48,6 +48,6 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --disaggregation-mode prefill \
--connector kvbm nixl \ --connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
...@@ -19,6 +19,6 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ ...@@ -19,6 +19,6 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 \ CUDA_VISIBLE_DEVICES=1 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--is-prefill-worker \ --disaggregation-mode prefill \
--connector lmcache nixl \ --connector lmcache nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -71,7 +71,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py ...@@ -71,7 +71,7 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py
# Start prefill worker (also handles encode routing via --route-to-encoder) # Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..." echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --route-to-encoder --is-prefill-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker # Start decode worker
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..." echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
......
...@@ -72,7 +72,7 @@ if [[ $HEAD_NODE -eq 1 ]]; then ...@@ -72,7 +72,7 @@ if [[ $HEAD_NODE -eq 1 ]]; then
python -m dynamo.vllm \ python -m dynamo.vllm \
--enable-multimodal \ --enable-multimodal \
--model $MODEL_NAME \ --model $MODEL_NAME \
--is-prefill-worker \ --disaggregation-mode prefill \
$MODEL_SPECIFIC_ARGS \ $MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
"${EXTRA_ARGS[@]}" & "${EXTRA_ARGS[@]}" &
......
...@@ -24,24 +24,24 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,24 +24,24 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-decode-worker & --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-decode-worker & --disaggregation-mode decode &
# two prefill workers # two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected # When registered with --disaggregation-mode prefill, these workers are automatically detected
# by the frontend, which activates an internal prefill router for KV-aware prefill routing # by the frontend, which activates an internal prefill router for KV-aware prefill routing
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}'& --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
...@@ -49,5 +49,5 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ ...@@ -49,5 +49,5 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
...@@ -33,7 +33,7 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -33,7 +33,7 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \ --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \ --connector none \
--is-decode-worker & --disaggregation-mode decode &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
...@@ -41,10 +41,10 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -41,10 +41,10 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \ --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \ --connector none \
--is-decode-worker & --disaggregation-mode decode &
# two prefill workers # two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected # When registered with --disaggregation-mode prefill, these workers are automatically detected
# by the frontend, which activates an internal prefill router for KV-aware prefill routing # by the frontend, which activates an internal prefill router for KV-aware prefill routing
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \ HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
...@@ -52,7 +52,7 @@ HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \ ...@@ -52,7 +52,7 @@ HABANA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \ --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \ --connector none \
--is-prefill-worker \ --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558", "enable_kv_cache_events":true}'& --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558", "enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
...@@ -61,5 +61,5 @@ HABANA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ ...@@ -61,5 +61,5 @@ HABANA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \ --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \ --connector none \
--is-prefill-worker \ --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559", "enable_kv_cache_events":true}'
...@@ -55,7 +55,7 @@ CUDA_VISIBLE_DEVICES=0 \ ...@@ -55,7 +55,7 @@ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--is-decode-worker \ --disaggregation-mode decode \
--gpu-memory-utilization ${GPU_MEM_FRACTION} & --gpu-memory-utilization ${GPU_MEM_FRACTION} &
DECODE_PID=$! DECODE_PID=$!
...@@ -75,7 +75,7 @@ CUDA_VISIBLE_DEVICES=0 \ ...@@ -75,7 +75,7 @@ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --disaggregation-mode prefill \
--gpu-memory-utilization ${GPU_MEM_FRACTION} \ --gpu-memory-utilization ${GPU_MEM_FRACTION} \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -82,7 +82,7 @@ Leave this terminal running - it will show Decode Worker logs. ...@@ -82,7 +82,7 @@ Leave this terminal running - it will show Decode Worker logs.
```bash ```bash
export DYN_LOG=debug # Increase log verbosity to see disaggregation export DYN_LOG=debug # Increase log verbosity to see disaggregation
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker \ CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --disaggregation-mode prefill \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
``` ```
......
...@@ -42,7 +42,7 @@ spec: ...@@ -42,7 +42,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 8 --is-decode-worker - python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 8 --disaggregation-mode decode
VllmPrefillWorker: VllmPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -69,4 +69,4 @@ spec: ...@@ -69,4 +69,4 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 4 --is-prefill-worker --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' - python3 -m dynamo.vllm --model meta-llama/Llama-3.1-70B-Instruct -tp 4 --disaggregation-mode prefill --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'
...@@ -90,7 +90,7 @@ Add `sh,-c` in **Entry point** and `cd examples/backends/vllm && python -m dynam ...@@ -90,7 +90,7 @@ Add `sh,-c` in **Entry point** and `cd examples/backends/vllm && python -m dynam
Create the PrefillWorker task same as the frontend worker, except for following changes Create the PrefillWorker task same as the frontend worker, except for following changes
- Set container name as `dynamo-prefill` - Set container name as `dynamo-prefill`
- No container port mapping - No container port mapping
- Docker configuration with command `cd examples/backends/vllm && python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker` - Docker configuration with command `cd examples/backends/vllm && python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode prefill`
## 5. Task Deployment ## 5. Task Deployment
You can create a service or directly run the task from the task definition You can create a service or directly run the task from the task definition
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
"-c" "-c"
], ],
"command": [ "command": [
"python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker" "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode prefill"
], ],
"environment": [ "environment": [
{ {
......
...@@ -35,7 +35,7 @@ spec: ...@@ -35,7 +35,7 @@ spec:
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/nvidia/bin:/usr/local/nvidia/lib64 export PATH=$PATH:/usr/local/nvidia/bin:/usr/local/nvidia/lib64
/sbin/ldconfig /sbin/ldconfig
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --disaggregation-mode decode
VllmPrefillWorker: VllmPrefillWorker:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -56,4 +56,4 @@ spec: ...@@ -56,4 +56,4 @@ spec:
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/nvidia/bin:/usr/local/nvidia/lib64 export PATH=$PATH:/usr/local/nvidia/bin:/usr/local/nvidia/lib64
/sbin/ldconfig /sbin/ldconfig
python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --disaggregation-mode prefill
...@@ -33,7 +33,7 @@ DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \ ...@@ -33,7 +33,7 @@ DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--endpoint dyn://prefill_pool_0.worker.generate \ --endpoint dyn://prefill_pool_0.worker.generate \
--is-prefill-worker \ --disaggregation-mode prefill \
--block-size 16 & --block-size 16 &
# ============================================================================ # ============================================================================
...@@ -47,7 +47,7 @@ DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \ ...@@ -47,7 +47,7 @@ DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--endpoint dyn://prefill_pool_1.worker.generate \ --endpoint dyn://prefill_pool_1.worker.generate \
--is-prefill-worker \ --disaggregation-mode prefill \
--block-size 16 & --block-size 16 &
# ============================================================================ # ============================================================================
...@@ -61,7 +61,7 @@ DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \ ...@@ -61,7 +61,7 @@ DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
--endpoint dyn://decode_pool_0.worker.generate \ --endpoint dyn://decode_pool_0.worker.generate \
--is-decode-worker \ --disaggregation-mode decode \
--block-size 16 & --block-size 16 &
# ============================================================================ # ============================================================================
......
...@@ -161,7 +161,8 @@ spec: ...@@ -161,7 +161,8 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --is-prefill-worker - --disaggregation-mode
- prefill
- --tensor-parallel-size - --tensor-parallel-size
- "1" - "1"
- --gpu-memory-utilization - --gpu-memory-utilization
...@@ -226,7 +227,8 @@ spec: ...@@ -226,7 +227,8 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-0.6B - Qwen/Qwen3-0.6B
- --is-prefill-worker - --disaggregation-mode
- prefill
- --tensor-parallel-size - --tensor-parallel-size
- "1" - "1"
- --gpu-memory-utilization - --gpu-memory-utilization
......
...@@ -142,7 +142,8 @@ spec: ...@@ -142,7 +142,8 @@ spec:
args: args:
- --model - --model
- /model-cache/deepseek-r1 - /model-cache/deepseek-r1
- --is-prefill-worker - --disaggregation-mode
- prefill
- --served-model-name - --served-model-name
- deepseek-ai/DeepSeek-R1 - deepseek-ai/DeepSeek-R1
- --all2all-backend - --all2all-backend
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment