Unverified commit eac94322, authored by Alec, committed by GitHub
Browse files

feat: add --kv-transfer-config NixlConnector to disagg scripts and recipes (#6560)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 4bd4c165
...@@ -33,6 +33,8 @@ spec: ...@@ -33,6 +33,8 @@ spec:
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
......
...@@ -33,6 +33,8 @@ spec: ...@@ -33,6 +33,8 @@ spec:
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
......
...@@ -35,6 +35,8 @@ spec: ...@@ -35,6 +35,8 @@ spec:
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
- --gpu-memory-utilization - --gpu-memory-utilization
- "0.23" - "0.23"
- --max-model-len - --max-model-len
......
...@@ -10,7 +10,7 @@ python -m dynamo.frontend & ...@@ -10,7 +10,7 @@ python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode decode & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --disaggregation-mode decode --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
...@@ -18,4 +18,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -18,4 +18,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -66,17 +66,17 @@ DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9} ...@@ -66,17 +66,17 @@ DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
# Start encode worker # Start encode worker
echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..." echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' & VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME --gpu-memory-utilization $DYN_ENCODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
# Start prefill worker (also handles encode routing via --route-to-encoder) # Start prefill worker (also handles encode routing via --route-to-encoder)
echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..." echo "Starting prefill worker on GPU $DYN_PREFILL_WORKER_GPU (GPU mem: $DYN_PREFILL_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' & CUDA_VISIBLE_DEVICES=$DYN_PREFILL_WORKER_GPU python -m dynamo.vllm --multimodal-worker --route-to-encoder --disaggregation-mode prefill --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_PREFILL_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# Start decode worker # Start decode worker
echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..." echo "Starting decode worker on GPU $DYN_DECODE_WORKER_GPU (GPU mem: $DYN_DECODE_GPU_MEM)..."
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=$DYN_DECODE_WORKER_GPU python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' & CUDA_VISIBLE_DEVICES=$DYN_DECODE_WORKER_GPU python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --enable-mm-embeds --model $MODEL_NAME --gpu-memory-utilization $DYN_DECODE_GPU_MEM $EXTRA_ARGS --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
echo "==================================================" echo "=================================================="
echo "All components started. Waiting for initialization..." echo "All components started. Waiting for initialization..."
......
...@@ -73,6 +73,7 @@ if [[ $HEAD_NODE -eq 1 ]]; then ...@@ -73,6 +73,7 @@ if [[ $HEAD_NODE -eq 1 ]]; then
--enable-multimodal \ --enable-multimodal \
--model $MODEL_NAME \ --model $MODEL_NAME \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
$MODEL_SPECIFIC_ARGS \ $MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
"${EXTRA_ARGS[@]}" & "${EXTRA_ARGS[@]}" &
...@@ -84,6 +85,7 @@ else ...@@ -84,6 +85,7 @@ else
python -m dynamo.vllm \ python -m dynamo.vllm \
--enable-multimodal \ --enable-multimodal \
--model $MODEL_NAME \ --model $MODEL_NAME \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
$MODEL_SPECIFIC_ARGS \ $MODEL_SPECIFIC_ARGS \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \ --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \
"${EXTRA_ARGS[@]}" & "${EXTRA_ARGS[@]}" &
......
...@@ -24,14 +24,16 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,14 +24,16 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--disaggregation-mode decode & --disaggregation-mode decode \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--disaggregation-mode decode & --disaggregation-mode decode \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
# two prefill workers # two prefill workers
# When registered with --disaggregation-mode prefill, these workers are automatically detected # When registered with --disaggregation-mode prefill, these workers are automatically detected
...@@ -42,6 +44,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \ ...@@ -42,6 +44,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}'& --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082","enable_kv_cache_events":true}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
...@@ -50,4 +53,5 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ ...@@ -50,4 +53,5 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083","enable_kv_cache_events":true}'
...@@ -56,6 +56,7 @@ python3 -m dynamo.vllm \ ...@@ -56,6 +56,7 @@ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--disaggregation-mode decode \ --disaggregation-mode decode \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization ${GPU_MEM_FRACTION} & --gpu-memory-utilization ${GPU_MEM_FRACTION} &
DECODE_PID=$! DECODE_PID=$!
...@@ -76,6 +77,7 @@ python3 -m dynamo.vllm \ ...@@ -76,6 +77,7 @@ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
--gpu-memory-utilization ${GPU_MEM_FRACTION} \ --gpu-memory-utilization ${GPU_MEM_FRACTION} \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
...@@ -97,6 +97,8 @@ spec: ...@@ -97,6 +97,8 @@ spec:
- "512" - "512"
- --compilation_config - --compilation_config
- '{"pass_config":{"fuse_norm_quant":true,"eliminate_noops":true},"cudagraph_mode":"FULL_DECODE_ONLY"}' - '{"pass_config":{"fuse_norm_quant":true,"eliminate_noops":true},"cudagraph_mode":"FULL_DECODE_ONLY"}'
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
prefill: prefill:
componentType: worker componentType: worker
subComponentType: prefill subComponentType: prefill
...@@ -144,6 +146,8 @@ spec: ...@@ -144,6 +146,8 @@ spec:
- /model-cache/deepseek-r1 - /model-cache/deepseek-r1
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
- --served-model-name - --served-model-name
- deepseek-ai/DeepSeek-R1 - deepseek-ai/DeepSeek-R1
- --all2all-backend - --all2all-backend
......
...@@ -42,7 +42,7 @@ spec: ...@@ -42,7 +42,7 @@ spec:
- name: HF_HOME - name: HF_HOME
value: /opt/models value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disaggregation-mode prefill --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disaggregation-mode prefill --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
...@@ -73,7 +73,7 @@ spec: ...@@ -73,7 +73,7 @@ spec:
- name: HF_HOME - name: HF_HOME
value: /opt/models value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
......
...@@ -54,7 +54,7 @@ spec: ...@@ -54,7 +54,7 @@ spec:
- name: HF_HOME - name: HF_HOME
value: /opt/models value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disaggregation-mode prefill --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disaggregation-mode prefill --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
...@@ -97,7 +97,7 @@ spec: ...@@ -97,7 +97,7 @@ spec:
- name: HF_HOME - name: HF_HOME
value: /opt/models value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
- /bin/sh - /bin/sh
- -c - -c
......
...@@ -45,6 +45,8 @@ spec: ...@@ -45,6 +45,8 @@ spec:
- Qwen/Qwen3-32B - Qwen/Qwen3-32B
- --disaggregation-mode - --disaggregation-mode
- decode - decode
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
- --tensor-parallel-size - --tensor-parallel-size
- '2' - '2'
- --disable-log-requests - --disable-log-requests
...@@ -99,6 +101,8 @@ spec: ...@@ -99,6 +101,8 @@ spec:
- Qwen/Qwen3-32B - Qwen/Qwen3-32B
- --disaggregation-mode - --disaggregation-mode
- prefill - prefill
- --kv-transfer-config
- '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
- --tensor-parallel-size - --tensor-parallel-size
- '2' - '2'
- --disable-log-requests - --disable-log-requests
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment