Unverified commit e3cee95f, authored by Kris Hung, committed by GitHub
Browse files

fix: Fix port collision and pass prefill param in kvbm connector (#4411)


Signed-off-by: krishung5 <krish@nvidia.com>
parent 88dfd1b3
...@@ -325,10 +325,16 @@ def parse_args() -> Config: ...@@ -325,10 +325,16 @@ def parse_args() -> Config:
def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]: def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
"""Create KVEventsConfig for prefix caching if needed.""" """Create KVEventsConfig for prefix caching if needed."""
if config.is_decode_worker:
logger.info(
f"Decode worker detected (is_decode_worker={config.is_decode_worker}): "
f"kv_events_config disabled (decode workers don't publish KV events)"
)
return None
# If prefix caching is not enabled, no events config needed # If prefix caching is not enabled, no events config needed
if not config.engine_args.enable_prefix_caching or config.is_decode_worker: if not config.engine_args.enable_prefix_caching:
logger.info("No kv_events_config required") logger.info("No kv_events_config required: prefix caching is disabled")
return None return None
# There is a bug with KV events publishing when LORA is enabled. # There is a bug with KV events publishing when LORA is enabled.
......
...@@ -10,19 +10,11 @@ export PYTHONHASHSEED=0 ...@@ -10,19 +10,11 @@ export PYTHONHASHSEED=0
# Common configuration # Common configuration
MODEL="Qwen/Qwen3-0.6B" MODEL="Qwen/Qwen3-0.6B"
# run decode router with kv-overlap-score-weight 0 for pure load balancing
python -m dynamo.frontend \ python -m dynamo.frontend \
--router-mode kv \ --router-mode kv \
--http-port 8000 \ --http-port 8000 \
--kv-overlap-score-weight 0 \
--router-reset-states & --router-reset-states &
# run standalone router service for prefill workers
python -m dynamo.router \
--endpoint dynamo.prefill.generate \
--router-reset-states \
--no-track-active-blocks &
# two decode workers (without KVBM) # two decode workers (without KVBM)
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
...@@ -30,6 +22,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -30,6 +22,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--enforce-eager \ --enforce-eager \
--is-decode-worker & --is-decode-worker &
VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
...@@ -37,6 +30,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -37,6 +30,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
# two prefill workers with KVBM enabled # two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts # Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \
CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
...@@ -44,10 +38,10 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -44,10 +38,10 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --is-prefill-worker \
--connector kvbm & --connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
DYN_VLLM_KV_EVENT_PORT=20081 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
...@@ -55,4 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -55,4 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
--model $MODEL \ --model $MODEL \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --is-prefill-worker \
--connector kvbm --connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment