Unverified commit ecf6d48c, authored by Alec, committed by GitHub
Browse files

fix(vllm): port allocation bugs leading to zmq error (#4321)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 58da7cfe
......@@ -36,7 +36,6 @@ class Config:
is_prefill_worker: bool
is_decode_worker: bool
migration_limit: int = 0
kv_port: Optional[int] = None
custom_jinja_template: Optional[str] = None
store_kv: str
......@@ -310,20 +309,12 @@ def parse_args() -> Config:
return config
async def configure_ports(config: Config):
"""Configure port settings from dedicated environment overrides."""
if config.engine_args.enable_prefix_caching:
config.kv_port = envs.DYN_VLLM_KV_EVENT_PORT
if config.has_connector("nixl"):
ensure_side_channel_host()
def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
"""Create KVEventsConfig for prefix caching if needed."""
# If prefix caching is not enabled, no events config needed
if not config.engine_args.enable_prefix_caching:
if not config.engine_args.enable_prefix_caching or config.is_decode_worker:
logger.info("No kv_events_config required")
return None
# There is a bug with KV events publishing when LORA is enabled.
......@@ -347,20 +338,19 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
# If user provided their own config, use that
if c := getattr(config.engine_args, "kv_events_config"):
logger.info(f"Using user-provided kv_events_config {c}")
return None
return c
# Create default events config for prefix caching
if config.kv_port is None:
raise ValueError(
"config.kv_port is not set; call configure_ports(...) before overwrite_args "
"or provide --kv-event-config to supply an explicit endpoint."
port = envs.DYN_VLLM_KV_EVENT_PORT
logger.info(
f"Using env-var DYN_VLLM_KV_EVENT_PORT={port} to create kv_events_config"
)
dp_rank = config.engine_args.data_parallel_rank or 0
return KVEventsConfig(
enable_kv_cache_events=True,
publisher="zmq",
endpoint=f"tcp://*:{config.kv_port - dp_rank}", # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
endpoint=f"tcp://*:{port - dp_rank}", # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
)
......@@ -416,6 +406,10 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:
def overwrite_args(config):
"""Set vLLM defaults for Dynamo."""
if config.has_connector("nixl"):
ensure_side_channel_host()
defaults = {
"task": "generate",
# As of vLLM >=0.10.0 the engine unconditionally calls
......@@ -431,14 +425,11 @@ def overwrite_args(config):
if kv_transfer_config:
defaults["kv_transfer_config"] = kv_transfer_config
kv_events_config = create_kv_events_config(config)
defaults["kv_events_config"] = create_kv_events_config(config)
logger.info(
f"Using Dynamo default kv_events_config for publishing kv events over zmq: {kv_events_config}"
f"Using kv_events_config for publishing vLLM kv events over zmq: {defaults['kv_events_config']}"
)
if kv_events_config:
defaults["kv_events_config"] = kv_events_config
logger.debug("Setting Dynamo defaults for vLLM")
for key, value in defaults.items():
if hasattr(config.engine_args, key):
......
......@@ -34,7 +34,7 @@ from dynamo.vllm.multimodal_handlers import (
ProcessorHandler,
)
from .args import ENABLE_LMCACHE, Config, configure_ports, overwrite_args, parse_args
from .args import ENABLE_LMCACHE, Config, overwrite_args, parse_args
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
from .publisher import StatLoggerFactory
......@@ -77,7 +77,6 @@ async def worker():
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, config.store_kv)
await configure_ports(config)
overwrite_args(config)
# Set up signal handler for graceful shutdown
......
......@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \
--connector kvbm \
--gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' &
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
......@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \
--connector kvbm \
--gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
......@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' &
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
......@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
......@@ -9,10 +9,10 @@ python -m dynamo.frontend --router-mode kv --http-port=8000 &
# run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager &
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
......
......@@ -27,11 +27,13 @@ python -m dynamo.router \
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager &
--enforce-eager \
--is-decode-worker &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--enforce-eager &
--enforce-eager \
--is-decode-worker &
# two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
......@@ -44,6 +46,8 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--is-prefill-worker \
--connector kvbm &
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
......
......@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' &
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' &
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected
......@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--enforce-eager \
--is-prefill-worker \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}'&
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
......@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--enforce-eager \
--is-prefill-worker \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559"}'
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083"}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment