Unverified Commit 3b9b3f31 authored by Sean SH Choi's avatar Sean SH Choi Committed by GitHub
Browse files

fix: Add --is-decode-worker flag to vllm examples (#5899)


Signed-off-by: default avatarSean Choi <sechoi@nvidia.com>
parent 6720dfb6
...@@ -10,7 +10,7 @@ python -m dynamo.frontend & ...@@ -10,7 +10,7 @@ python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-decode-worker &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \ DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
DYN_VLLM_KV_EVENT_PORT=20081 \ DYN_VLLM_KV_EVENT_PORT=20081 \
......
...@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'& --is-decode-worker &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' & --is-decode-worker &
# two prefill workers # two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected # When registered with --is-prefill-worker, these workers are automatically detected
......
...@@ -33,7 +33,7 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -33,7 +33,7 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \ --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \ --connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556", "enable_kv_cache_events":true}' & --is-decode-worker &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
...@@ -41,7 +41,7 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -41,7 +41,7 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \ --kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \ --connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557", "enable_kv_cache_events":true}' & --is-decode-worker &
# two prefill workers # two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected # When registered with --is-prefill-worker, these workers are automatically detected
......
...@@ -55,6 +55,7 @@ CUDA_VISIBLE_DEVICES=0 \ ...@@ -55,6 +55,7 @@ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \ python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \ --model Qwen/Qwen3-0.6B \
--enforce-eager \ --enforce-eager \
--is-decode-worker \
--gpu-memory-utilization ${GPU_MEM_FRACTION} & --gpu-memory-utilization ${GPU_MEM_FRACTION} &
DECODE_PID=$! DECODE_PID=$!
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment