Unverified commit ecf6d48c, authored by Alec and committed by GitHub
Browse files

fix(vllm): port allocation bugs leading to zmq error (#4321)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 58da7cfe
...@@ -36,7 +36,6 @@ class Config: ...@@ -36,7 +36,6 @@ class Config:
is_prefill_worker: bool is_prefill_worker: bool
is_decode_worker: bool is_decode_worker: bool
migration_limit: int = 0 migration_limit: int = 0
kv_port: Optional[int] = None
custom_jinja_template: Optional[str] = None custom_jinja_template: Optional[str] = None
store_kv: str store_kv: str
...@@ -310,20 +309,12 @@ def parse_args() -> Config: ...@@ -310,20 +309,12 @@ def parse_args() -> Config:
return config return config
async def configure_ports(config: Config):
"""Configure port settings from dedicated environment overrides."""
if config.engine_args.enable_prefix_caching:
config.kv_port = envs.DYN_VLLM_KV_EVENT_PORT
if config.has_connector("nixl"):
ensure_side_channel_host()
def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]: def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
"""Create KVEventsConfig for prefix caching if needed.""" """Create KVEventsConfig for prefix caching if needed."""
# If prefix caching is not enabled, no events config needed # If prefix caching is not enabled, no events config needed
if not config.engine_args.enable_prefix_caching: if not config.engine_args.enable_prefix_caching or config.is_decode_worker:
logger.info("No kv_events_config required")
return None return None
# There is a bug with KV events publishing when LORA is enabled. # There is a bug with KV events publishing when LORA is enabled.
...@@ -347,20 +338,19 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]: ...@@ -347,20 +338,19 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
# If user provided their own config, use that # If user provided their own config, use that
if c := getattr(config.engine_args, "kv_events_config"): if c := getattr(config.engine_args, "kv_events_config"):
logger.info(f"Using user-provided kv_events_config {c}") logger.info(f"Using user-provided kv_events_config {c}")
return None return c
# Create default events config for prefix caching # Create default events config for prefix caching
if config.kv_port is None: port = envs.DYN_VLLM_KV_EVENT_PORT
raise ValueError( logger.info(
"config.kv_port is not set; call configure_ports(...) before overwrite_args " f"Using env-var DYN_VLLM_KV_EVENT_PORT={port} to create kv_events_config"
"or provide --kv-event-config to supply an explicit endpoint." )
)
dp_rank = config.engine_args.data_parallel_rank or 0 dp_rank = config.engine_args.data_parallel_rank or 0
return KVEventsConfig( return KVEventsConfig(
enable_kv_cache_events=True, enable_kv_cache_events=True,
publisher="zmq", publisher="zmq",
endpoint=f"tcp://*:{config.kv_port - dp_rank}", # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM endpoint=f"tcp://*:{port - dp_rank}", # vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
) )
...@@ -416,6 +406,10 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]: ...@@ -416,6 +406,10 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:
def overwrite_args(config): def overwrite_args(config):
"""Set vLLM defaults for Dynamo.""" """Set vLLM defaults for Dynamo."""
if config.has_connector("nixl"):
ensure_side_channel_host()
defaults = { defaults = {
"task": "generate", "task": "generate",
# As of vLLM >=0.10.0 the engine unconditionally calls # As of vLLM >=0.10.0 the engine unconditionally calls
...@@ -431,14 +425,11 @@ def overwrite_args(config): ...@@ -431,14 +425,11 @@ def overwrite_args(config):
if kv_transfer_config: if kv_transfer_config:
defaults["kv_transfer_config"] = kv_transfer_config defaults["kv_transfer_config"] = kv_transfer_config
kv_events_config = create_kv_events_config(config) defaults["kv_events_config"] = create_kv_events_config(config)
logger.info( logger.info(
f"Using Dynamo default kv_events_config for publishing kv events over zmq: {kv_events_config}" f"Using kv_events_config for publishing vLLM kv events over zmq: {defaults['kv_events_config']}"
) )
if kv_events_config:
defaults["kv_events_config"] = kv_events_config
logger.debug("Setting Dynamo defaults for vLLM") logger.debug("Setting Dynamo defaults for vLLM")
for key, value in defaults.items(): for key, value in defaults.items():
if hasattr(config.engine_args, key): if hasattr(config.engine_args, key):
......
...@@ -34,7 +34,7 @@ from dynamo.vllm.multimodal_handlers import ( ...@@ -34,7 +34,7 @@ from dynamo.vllm.multimodal_handlers import (
ProcessorHandler, ProcessorHandler,
) )
from .args import ENABLE_LMCACHE, Config, configure_ports, overwrite_args, parse_args from .args import ENABLE_LMCACHE, Config, overwrite_args, parse_args
from .handlers import DecodeWorkerHandler, PrefillWorkerHandler from .handlers import DecodeWorkerHandler, PrefillWorkerHandler
from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload from .health_check import VllmHealthCheckPayload, VllmPrefillHealthCheckPayload
from .publisher import StatLoggerFactory from .publisher import StatLoggerFactory
...@@ -77,7 +77,6 @@ async def worker(): ...@@ -77,7 +77,6 @@ async def worker():
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, config.store_kv) runtime = DistributedRuntime(loop, config.store_kv)
await configure_ports(config)
overwrite_args(config) overwrite_args(config)
# Set up signal handler for graceful shutdown # Set up signal handler for graceful shutdown
......
...@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \ ...@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \ --enforce-eager \
--connector kvbm \ --connector kvbm \
--gpu-memory-utilization 0.4 \ --gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
...@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \ ...@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager \ --enforce-eager \
--connector kvbm \ --connector kvbm \
--gpu-memory-utilization 0.4 \ --gpu-memory-utilization 0.4 \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
...@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--connector none \ --connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
...@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ ...@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--connector none \ --connector none \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
...@@ -9,10 +9,10 @@ python -m dynamo.frontend --router-mode kv --http-port=8000 & ...@@ -9,10 +9,10 @@ python -m dynamo.frontend --router-mode kv --http-port=8000 &
# run decode workers on GPU 0 and 1, without enabling KVBM # run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use # NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager & CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
DYN_VLLM_KV_EVENT_PORT=20081 \ DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager & CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector nixl --enforce-eager --is-decode-worker &
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache # run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts # NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
......
...@@ -27,11 +27,13 @@ python -m dynamo.router \ ...@@ -27,11 +27,13 @@ python -m dynamo.router \
# --enforce-eager is added for quick deployment. for production use, need to remove this flag # --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager & --enforce-eager \
--is-decode-worker &
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--enforce-eager & --enforce-eager \
--is-decode-worker &
# two prefill workers with KVBM enabled # two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts # Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
...@@ -44,6 +46,8 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \ ...@@ -44,6 +46,8 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--is-prefill-worker \ --is-prefill-worker \
--connector kvbm & --connector kvbm &
DYN_VLLM_KV_EVENT_PORT=20081 \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \ DYN_KVBM_LEADER_ZMQ_PUB_PORT=56003 \
DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \ DYN_KVBM_LEADER_ZMQ_ACK_PORT=56004 \
CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
......
...@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \ ...@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5556"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model $MODEL \ --model $MODEL \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' & --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
# two prefill workers # two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected # When registered with --is-prefill-worker, these workers are automatically detected
...@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \ ...@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --is-prefill-worker \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}'& --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'&
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \ VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
...@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \ ...@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \ --block-size $BLOCK_SIZE \
--enforce-eager \ --enforce-eager \
--is-prefill-worker \ --is-prefill-worker \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559"}' --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20083"}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment