Commit 3b9b3f31 (unverified), authored by Sean SH Choi, committed by GitHub
Browse files

fix: Add --is-decode-worker flag to vllm examples (#5899)


Signed-off-by: Sean Choi <sechoi@nvidia.com>
parent 6720dfb6
......@@ -10,7 +10,7 @@ python -m dynamo.frontend &
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-decode-worker &
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
DYN_VLLM_KV_EVENT_PORT=20081 \
......
......@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}'&
--is-decode-worker &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}' &
--is-decode-worker &
# two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected
......
......@@ -33,7 +33,7 @@ HABANA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556", "enable_kv_cache_events":true}' &
--is-decode-worker &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
......@@ -41,7 +41,7 @@ HABANA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--kv-transfer-config "{\"kv_connector\": \"NixlConnector\", \"kv_role\": \"kv_both\", \"kv_buffer_device\": \"${NIXL_BUFFER_DEVICE}\", \"kv_connector_extra_config\": {\"backends\": [\"${VLLM_NIXL_BACKEND}\"]}}" \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557", "enable_kv_cache_events":true}' &
--is-decode-worker &
# two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected
......
......@@ -55,6 +55,7 @@ CUDA_VISIBLE_DEVICES=0 \
python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-decode-worker \
--gpu-memory-utilization ${GPU_MEM_FRACTION} &
DECODE_PID=$!
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment