fix: Fix port collision and pass prefill param in kvbm connector (#4411)

Signed-off-by: krishung5 <krish@nvidia.com>

fix: Fix port collision and pass prefill param in kvbm connector (#4411)
Signed-off-by: krishung5 <krish@nvidia.com>
e3cee95f · Kris Hung · GitHub · 88dfd1b3 · e3cee95f · e3cee95f
Unverified commit e3cee95f, authored Nov 17, 2025 by Kris Hung; committed by GitHub on Nov 17, 2025.
Showing 2 changed files with 15 additions and 14 deletions

components/src/dynamo/vllm/args.py +8 -2

examples/backends/vllm/launch/disagg_kvbm_router.sh +7 -12

No files found.
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -325,10 +325,16 @@ def parse_args() -> Config:

 def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
    """Create KVEventsConfig for prefix caching if needed."""
+    if config.is_decode_worker:
+        logger.info(
+            f"Decode worker detected (is_decode_worker={config.is_decode_worker}): "
+            f"kv_events_config disabled (decode workers don't publish KV events)"
+        )
+        return None

    # If prefix caching is not enabled, no events config needed
-    if not config.engine_args.enable_prefix_caching or config.is_decode_worker:
-        logger.info("No kv_events_config required")
+    if not config.engine_args.enable_prefix_caching:
+        logger.info("No kv_events_config required: prefix caching is disabled")
        return None

    # There is a bug with KV events publishing when LORA is enabled.

--- a/examples/backends/vllm/launch/disagg_kvbm_router.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_router.sh
@@ -10,19 +10,11 @@ export PYTHONHASHSEED=0
 # Common configuration
 MODEL="Qwen/Qwen3-0.6B"

-# run decode router with kv-overlap-score-weight 0 for pure load balancing
 python -m dynamo.frontend \
    --router-mode kv \
    --http-port 8000 \
-    --kv-overlap-score-weight 0 \
    --router-reset-states &

-# run standalone router service for prefill workers
-python -m dynamo.router \
-    --endpoint dynamo.prefill.generate \
-    --router-reset-states \
-    --no-track-active-blocks &
-
 # two decode workers (without KVBM)
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
 CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
@@ -30,6 +22,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --enforce-eager \
    --is-decode-worker &

+VLLM_NIXL_SIDE_CHANNEL_PORT=20096 \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --model $MODEL \
    --enforce-eager \
@@ -37,6 +30,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \

 # two prefill workers with KVBM enabled
 # Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 DYN_KVBM_LEADER_ZMQ_PUB_PORT=56001 \
 DYN_KVBM_LEADER_ZMQ_ACK_PORT=56002 \
 CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
@@ -44,10 +38,10 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
    --model $MODEL \
    --enforce-eager \
    --is-prefill-worker \
-    --connector kvbm &
+    --connector kvbm nixl \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

-DYN_VLLM_KV_EVENT_PORT=20081 \
-VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
+VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
 DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
 DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
 CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
@@ -55,4 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
    --model $MODEL \
    --enforce-eager \
    --is-prefill-worker \
-    --connector kvbm
+    --connector kvbm nixl \
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'