fix(vllm): port allocation bugs leading to zmq error (#4321)

Signed-off-by: alec-flowers <aflowers@nvidia.com>

fix(vllm): port allocation bugs leading to zmq error (#4321)
Signed-off-by: alec-flowers <aflowers@nvidia.com>
ecf6d48c · Alec · GitHub · 58da7cfe · ecf6d48c · ecf6d48c
Unverified commit ecf6d48c, authored Nov 13, 2025 by Alec; committed by GitHub on Nov 14, 2025.
7 changed files
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -36,7 +36,6 @@ class Config:
    is_prefill_worker: bool
    is_decode_worker: bool
    migration_limit: int = 0
-    kv_port: Optional[int] = None
    custom_jinja_template: Optional[str] = None
    store_kv: str

@@ -310,20 +309,12 @@ def parse_args() -> Config:
    return config


-async def configure_ports(config: Config):
-    """Configure port settings from dedicated environment overrides."""
-
-    if config.engine_args.enable_prefix_caching:
-        config.kv_port = envs.DYN_VLLM_KV_EVENT_PORT
-
-    if config.has_connector("nixl"):
-        ensure_side_channel_host()
-
-
 def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
    """Create KVEventsConfig for prefix caching if needed."""
+
    # If prefix caching is not enabled, no events config needed
-    if not config.engine_args.enable_prefix_caching:
+    if not config.engine_args.enable_prefix_caching or config.is_decode_worker:
+        logger.info("No kv_events_config required")
        return None

    # There is a bug with KV events publishing when LORA is enabled.
@@ -347,20 +338,19 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
    # If user provided their own config, use that
    if c := getattr(config.engine_args, "kv_events_config"):
        logger.info(f"Using user-provided kv_events_config {c}")
-        return None
+        return c

    # Create default events config for prefix caching
-    if config.kv_port is None:
-        raise ValueError(
-            "config.kv_port is not set; call configure_ports(...) before overwrite_args "
-            "or provide --kv-event-config to supply an explicit endpoint."
+    port = envs.DYN_VLLM_KV_EVENT_PORT
+    logger.info(
+        f"Using env-var DYN_VLLM_KV_EVENT_PORT={port} to create kv_events_config"
    )
    dp_rank = config.engine_args.data_parallel_rank or 0

    return KVEventsConfig(
        enable_kv_cache_events=True,
        publisher="zmq",
-        endpoint=f"tcp://*:{config.kv_port - dp_rank}",  # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
+        endpoint=f"tcp://*:{port - dp_rank}",  # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
    )


@@ -416,6 +406,10 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:

 def overwrite_args(config):
    """Set vLLM defaults for Dynamo."""
+
+    if config.has_connector("nixl"):
+        ensure_side_channel_host()
+
    defaults = {
        "task": "generate",
        # As of vLLM >=0.10.0 the engine unconditionally calls
@@ -431,14 +425,11 @@ def overwrite_args(config):
    if kv_transfer_config:
        defaults["kv_transfer_config"] = kv_transfer_config

-    kv_events_config = create_kv_events_config(config)
+    defaults["kv_events_config"] = create_kv_events_config(config)
    logger.info(
-        f"Using Dynamo default kv_events_config for publishing kv events over zmq: {kv_events_config}"
+        f"Using kv_events_config for publishing vLLM kv events over zmq: {defaults['kv_events_config']}"
    )

-    if kv_events_config:
-        defaults["kv_events_config"] = kv_events_config
-
    logger.debug("Setting Dynamo defaults for vLLM")
    for key, value in defaults.items():
        if hasattr(config.engine_args, key):

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -34,7 +34,7 @@ from dynamo.vllm.multimodal_handlers import (
    ProcessorHandler,
 )

-from .args import ENABLE_LMCACHE, Config, configure_ports, overwrite_args, parse_args
+from .args import ENABLE_LMCACHE, Config, overwrite_args, parse_args
 from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
 from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
 from .publisher import StatLoggerFactory
@@ -77,7 +77,6 @@ async def worker():
    loop = asyncio.get_running_loop()
    runtime = DistributedRuntime(loop, config.store_kv)

-    await configure_ports(config)
    overwrite_args(config)

    # Set up signal handler for graceful shutdown

--- a/examples/backends/vllm/launch/agg_kvbm_router.sh
+++ b/examples/backends/vllm/launch/agg_kvbm_router.sh
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
    --enforce-eager \
    --connector kvbm \
    --gpu-memory-utilization 0.4 \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' &
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &

 DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
 DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
    --enforce-eager \
    --connector kvbm \
    --gpu-memory-utilization 0.4 \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
--- a/examples/backends/vllm/launch/agg_router.sh
+++ b/examples/backends/vllm/launch/agg_router.sh
@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --block-size $BLOCK_SIZE \
    --enforce-eager \
    --connector none \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' &
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &

 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --block-size $BLOCK_SIZE \
    --enforce-eager \
    --connector none \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
--- a/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
@@ -9,10 +9,10 @@ python -m dynamo.frontend --router-mode kv --http-port=8000 &

 # run decode workers on GPU 0 and 1, without enabling KVBM
 # NOTE: remove --enforce-eager for production use
-CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager &
+CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
 DYN_VLLM_KV_EVENT_PORT=20081 \
 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
-CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager &
+CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &

 # run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
 # NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts

--- a/examples/backends/vllm/launch/disagg_kvbm_router.sh
+++ b/examples/backends/vllm/launch/disagg_kvbm_router.sh
@@ -27,11 +27,13 @@ python -m dynamo.router \
 # --enforce-eager is added for quick deployment. for production use, need to remove this flag
 CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --model $MODEL \
-    --enforce-eager &
+    --enforce-eager \
+    --is-decode-worker &

 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --model $MODEL \
-    --enforce-eager &
+    --enforce-eager \
+    --is-decode-worker &

 # two prefill workers with KVBM enabled
 # Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
@@ -44,6 +46,8 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
    --is-prefill-worker \
    --connector kvbm &

+DYN_VLLM_KV_EVENT_PORT=20081 \
+VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
 DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
 CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \

--- a/examples/backends/vllm/launch/disagg_router.sh
+++ b/examples/backends/vllm/launch/disagg_router.sh
@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
    --model $MODEL \
    --block-size $BLOCK_SIZE \
    --enforce-eager \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' &
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'&

 VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
 CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
    --model $MODEL \
    --block-size $BLOCK_SIZE \
    --enforce-eager \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' &
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &

 # two prefill workers
 # When registered with --is-prefill-worker, these workers are automatically detected
@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
    --block-size $BLOCK_SIZE \
    --enforce-eager \
    --is-prefill-worker \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}'&
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'&

 VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
 CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
    --block-size $BLOCK_SIZE \
    --enforce-eager \
    --is-prefill-worker \
-    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559"}'
+    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083"}'