Unverified commit e3cee95f, authored by Kris Hung and committed by GitHub
Browse files

fix: Fix port collision and pass prefill param in kvbm connector (#4411)


Signed-off-by: krishung5 <krish@nvidia.com>
parent 88dfd1b3
......@@ -325,10 +325,16 @@ def parse_args() -> Config:
def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
"""Create KVEventsConfig for prefix caching if needed."""
if config.is_decode_worker:
logger.info(
f"Decode worker detected (is_decode_worker={config.is_decode_worker}): "
f"kv_events_config disabled (decode workers don't publish KV events)"
)
return None
# If prefix caching is not enabled, no events config needed
if not config.engine_args.enable_prefix_caching or config.is_decode_worker:
logger.info("No kv_events_config required")
if not config.engine_args.enable_prefix_caching:
logger.info("No kv_events_config required: prefix caching is disabled")
return None
# There is a bug with KV events publishing when LORA is enabled.
......
......@@ -10,19 +10,11 @@ export PYTHONHASHSEED=0
# Common configuration
MODEL="Qwen/Qwen3-0.6B"
# run decode router with kv-overlap-score-weight 0 for pure load balancing
python -m dynamo.frontend \
--router-mode kv \
--http-port 8000 \
--kv-overlap-score-weight 0 \
--router-reset-states &
# run standalone router service for prefill workers
python -m dynamo.router \
--endpoint dynamo.prefill.generate \
--router-reset-states \
--no-track-active-blocks &
# two decode workers (without KVBM)
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
......@@ -30,6 +22,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--enforce-eager \
--is-decode-worker &
VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager \
......@@ -37,6 +30,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
# two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \
CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
......@@ -44,10 +38,10 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--model $MODEL \
--enforce-eager \
--is-prefill-worker \
--connector kvbm &
--connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
......@@ -55,4 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
--model $MODEL \
--enforce-eager \
--is-prefill-worker \
--connector kvbm
--connector kvbm nixl \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment