Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
ecf6d48c
Unverified
Commit
ecf6d48c
authored
Nov 13, 2025
by
Alec
Committed by
GitHub
Nov 14, 2025
Browse files
fix(vllm): port allocation bugs leading to zmq error (#4321)
Signed-off-by:
alec-flowers
<
aflowers@nvidia.com
>
parent
58da7cfe
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
32 additions
and
38 deletions
+32
-38
components/src/dynamo/vllm/args.py
components/src/dynamo/vllm/args.py
+15
-24
components/src/dynamo/vllm/main.py
components/src/dynamo/vllm/main.py
+1
-2
examples/backends/vllm/launch/agg_kvbm_router.sh
examples/backends/vllm/launch/agg_kvbm_router.sh
+2
-2
examples/backends/vllm/launch/agg_router.sh
examples/backends/vllm/launch/agg_router.sh
+2
-2
examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
+2
-2
examples/backends/vllm/launch/disagg_kvbm_router.sh
examples/backends/vllm/launch/disagg_kvbm_router.sh
+6
-2
examples/backends/vllm/launch/disagg_router.sh
examples/backends/vllm/launch/disagg_router.sh
+4
-4
No files found.
components/src/dynamo/vllm/args.py
View file @
ecf6d48c
...
@@ -36,7 +36,6 @@ class Config:
...
@@ -36,7 +36,6 @@ class Config:
is_prefill_worker
:
bool
is_prefill_worker
:
bool
is_decode_worker
:
bool
is_decode_worker
:
bool
migration_limit
:
int
=
0
migration_limit
:
int
=
0
kv_port
:
Optional
[
int
]
=
None
custom_jinja_template
:
Optional
[
str
]
=
None
custom_jinja_template
:
Optional
[
str
]
=
None
store_kv
:
str
store_kv
:
str
...
@@ -310,20 +309,12 @@ def parse_args() -> Config:
...
@@ -310,20 +309,12 @@ def parse_args() -> Config:
return
config
return
config
async
def
configure_ports
(
config
:
Config
):
"""Configure port settings from dedicated environment overrides."""
if
config
.
engine_args
.
enable_prefix_caching
:
config
.
kv_port
=
envs
.
DYN_VLLM_KV_EVENT_PORT
if
config
.
has_connector
(
"nixl"
):
ensure_side_channel_host
()
def
create_kv_events_config
(
config
:
Config
)
->
Optional
[
KVEventsConfig
]:
def
create_kv_events_config
(
config
:
Config
)
->
Optional
[
KVEventsConfig
]:
"""Create KVEventsConfig for prefix caching if needed."""
"""Create KVEventsConfig for prefix caching if needed."""
# If prefix caching is not enabled, no events config needed
# If prefix caching is not enabled, no events config needed
if
not
config
.
engine_args
.
enable_prefix_caching
:
if
not
config
.
engine_args
.
enable_prefix_caching
or
config
.
is_decode_worker
:
logger
.
info
(
"No kv_events_config required"
)
return
None
return
None
# There is a bug with KV events publishing when LORA is enabled.
# There is a bug with KV events publishing when LORA is enabled.
...
@@ -347,20 +338,19 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
...
@@ -347,20 +338,19 @@ def create_kv_events_config(config: Config) -> Optional[KVEventsConfig]:
# If user provided their own config, use that
# If user provided their own config, use that
if
c
:
=
getattr
(
config
.
engine_args
,
"kv_events_config"
):
if
c
:
=
getattr
(
config
.
engine_args
,
"kv_events_config"
):
logger
.
info
(
f
"Using user-provided kv_events_config
{
c
}
"
)
logger
.
info
(
f
"Using user-provided kv_events_config
{
c
}
"
)
return
None
return
c
# Create default events config for prefix caching
# Create default events config for prefix caching
if
config
.
kv_port
is
None
:
port
=
envs
.
DYN_VLLM_KV_EVENT_PORT
raise
ValueError
(
logger
.
info
(
"config.kv_port is not set; call configure_ports(...) before overwrite_args "
f
"Using env-var DYN_VLLM_KV_EVENT_PORT=
{
port
}
to create kv_events_config"
"or provide --kv-event-config to supply an explicit endpoint."
)
)
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
return
KVEventsConfig
(
return
KVEventsConfig
(
enable_kv_cache_events
=
True
,
enable_kv_cache_events
=
True
,
publisher
=
"zmq"
,
publisher
=
"zmq"
,
endpoint
=
f
"tcp://*:
{
config
.
kv_
port
-
dp_rank
}
"
,
# vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
endpoint
=
f
"tcp://*:
{
port
-
dp_rank
}
"
,
# vLLM will iterate dp_rank for us, so we need to subtract it out TODO: fix in vLLM
)
)
...
@@ -416,6 +406,10 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:
...
@@ -416,6 +406,10 @@ def create_kv_transfer_config(config: Config) -> Optional[KVTransferConfig]:
def
overwrite_args
(
config
):
def
overwrite_args
(
config
):
"""Set vLLM defaults for Dynamo."""
"""Set vLLM defaults for Dynamo."""
if
config
.
has_connector
(
"nixl"
):
ensure_side_channel_host
()
defaults
=
{
defaults
=
{
"task"
:
"generate"
,
"task"
:
"generate"
,
# As of vLLM >=0.10.0 the engine unconditionally calls
# As of vLLM >=0.10.0 the engine unconditionally calls
...
@@ -431,14 +425,11 @@ def overwrite_args(config):
...
@@ -431,14 +425,11 @@ def overwrite_args(config):
if
kv_transfer_config
:
if
kv_transfer_config
:
defaults
[
"kv_transfer_config"
]
=
kv_transfer_config
defaults
[
"kv_transfer_config"
]
=
kv_transfer_config
kv_events_config
=
create_kv_events_config
(
config
)
defaults
[
"
kv_events_config
"
]
=
create_kv_events_config
(
config
)
logger
.
info
(
logger
.
info
(
f
"Using
Dynamo default
kv_events_config for publishing kv events over zmq:
{
kv_events_config
}
"
f
"Using kv_events_config for publishing
vLLM
kv events over zmq:
{
defaults
[
'
kv_events_config
'
]
}
"
)
)
if
kv_events_config
:
defaults
[
"kv_events_config"
]
=
kv_events_config
logger
.
debug
(
"Setting Dynamo defaults for vLLM"
)
logger
.
debug
(
"Setting Dynamo defaults for vLLM"
)
for
key
,
value
in
defaults
.
items
():
for
key
,
value
in
defaults
.
items
():
if
hasattr
(
config
.
engine_args
,
key
):
if
hasattr
(
config
.
engine_args
,
key
):
...
...
components/src/dynamo/vllm/main.py
View file @
ecf6d48c
...
@@ -34,7 +34,7 @@ from dynamo.vllm.multimodal_handlers import (
...
@@ -34,7 +34,7 @@ from dynamo.vllm.multimodal_handlers import (
ProcessorHandler
,
ProcessorHandler
,
)
)
from
.args
import
ENABLE_LMCACHE
,
Config
,
configure_ports
,
overwrite_args
,
parse_args
from
.args
import
ENABLE_LMCACHE
,
Config
,
overwrite_args
,
parse_args
from
.handlers
import
DecodeWorkerHandler
,
PrefillWorkerHandler
from
.handlers
import
DecodeWorkerHandler
,
PrefillWorkerHandler
from
.health_check
import
VllmHealthCheckPayload
,
VllmPrefillHealthCheckPayload
from
.health_check
import
VllmHealthCheckPayload
,
VllmPrefillHealthCheckPayload
from
.publisher
import
StatLoggerFactory
from
.publisher
import
StatLoggerFactory
...
@@ -77,7 +77,6 @@ async def worker():
...
@@ -77,7 +77,6 @@ async def worker():
loop
=
asyncio
.
get_running_loop
()
loop
=
asyncio
.
get_running_loop
()
runtime
=
DistributedRuntime
(
loop
,
config
.
store_kv
)
runtime
=
DistributedRuntime
(
loop
,
config
.
store_kv
)
await
configure_ports
(
config
)
overwrite_args
(
config
)
overwrite_args
(
config
)
# Set up signal handler for graceful shutdown
# Set up signal handler for graceful shutdown
...
...
examples/backends/vllm/launch/agg_kvbm_router.sh
View file @
ecf6d48c
...
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
...
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager
\
--enforce-eager
\
--connector
kvbm
\
--connector
kvbm
\
--gpu-memory-utilization
0.4
\
--gpu-memory-utilization
0.4
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5556
"}'
&
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20080
"}'
&
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56003
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56003
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56004
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56004
\
...
@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
...
@@ -38,4 +38,4 @@ CUDA_VISIBLE_DEVICES=0 DYN_KVBM_CPU_CACHE_GB=2 \
--enforce-eager
\
--enforce-eager
\
--connector
kvbm
\
--connector
kvbm
\
--gpu-memory-utilization
0.4
\
--gpu-memory-utilization
0.4
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5557
"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20081
"}'
examples/backends/vllm/launch/agg_router.sh
View file @
ecf6d48c
...
@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
...
@@ -24,7 +24,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--block-size
$BLOCK_SIZE
\
--block-size
$BLOCK_SIZE
\
--enforce-eager
\
--enforce-eager
\
--connector
none
\
--connector
none
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5556
"}'
&
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20080
"}'
&
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
...
@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
...
@@ -32,4 +32,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size
$BLOCK_SIZE
\
--block-size
$BLOCK_SIZE
\
--enforce-eager
\
--enforce-eager
\
--connector
none
\
--connector
none
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5557
"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20081
"}'
examples/backends/vllm/launch/disagg_kvbm_2p2d.sh
View file @
ecf6d48c
...
@@ -9,10 +9,10 @@ python -m dynamo.frontend --router-mode kv --http-port=8000 &
...
@@ -9,10 +9,10 @@ python -m dynamo.frontend --router-mode kv --http-port=8000 &
# run decode workers on GPU 0 and 1, without enabling KVBM
# run decode workers on GPU 0 and 1, without enabling KVBM
# NOTE: remove --enforce-eager for production use
# NOTE: remove --enforce-eager for production use
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
--connector
nixl
--enforce-eager
&
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
--connector
nixl
--enforce-eager
--is-decode-worker
&
DYN_VLLM_KV_EVENT_PORT
=
20081
\
DYN_VLLM_KV_EVENT_PORT
=
20081
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
--connector
nixl
--enforce-eager
&
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
--connector
nixl
--enforce-eager
--is-decode-worker
&
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# run prefill workers on GPU 2 and 3 with KVBM enabled using 20GB of CPU cache
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
# NOTE: use different barrier id prefixes for each prefill worker to avoid conflicts
...
...
examples/backends/vllm/launch/disagg_kvbm_router.sh
View file @
ecf6d48c
...
@@ -27,11 +27,13 @@ python -m dynamo.router \
...
@@ -27,11 +27,13 @@ python -m dynamo.router \
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.vllm
\
--model
$MODEL
\
--model
$MODEL
\
--enforce-eager
&
--enforce-eager
\
--is-decode-worker
&
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
--model
$MODEL
\
--model
$MODEL
\
--enforce-eager
&
--enforce-eager
\
--is-decode-worker
&
# two prefill workers with KVBM enabled
# two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
...
@@ -44,6 +46,8 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
...
@@ -44,6 +46,8 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--is-prefill-worker
\
--is-prefill-worker
\
--connector
kvbm &
--connector
kvbm &
DYN_VLLM_KV_EVENT_PORT
=
20081
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56003
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56003
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56004
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56004
\
CUDA_VISIBLE_DEVICES
=
3
DYN_KVBM_CPU_CACHE_GB
=
20
\
CUDA_VISIBLE_DEVICES
=
3
DYN_KVBM_CPU_CACHE_GB
=
20
\
...
...
examples/backends/vllm/launch/disagg_router.sh
View file @
ecf6d48c
...
@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
...
@@ -24,14 +24,14 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model
$MODEL
\
--model
$MODEL
\
--block-size
$BLOCK_SIZE
\
--block-size
$BLOCK_SIZE
\
--enforce-eager
\
--enforce-eager
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5556
"}'
&
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20080
"}'
&
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
--model
$MODEL
\
--model
$MODEL
\
--block-size
$BLOCK_SIZE
\
--block-size
$BLOCK_SIZE
\
--enforce-eager
\
--enforce-eager
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5557
"}'
&
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20081
"}'
&
# two prefill workers
# two prefill workers
# When registered with --is-prefill-worker, these workers are automatically detected
# When registered with --is-prefill-worker, these workers are automatically detected
...
@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
...
@@ -42,7 +42,7 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.vllm \
--block-size
$BLOCK_SIZE
\
--block-size
$BLOCK_SIZE
\
--enforce-eager
\
--enforce-eager
\
--is-prefill-worker
\
--is-prefill-worker
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5558
"}'
&
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20082
"}'
&
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20099
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20099
\
CUDA_VISIBLE_DEVICES
=
3 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
3 python3
-m
dynamo.vllm
\
...
@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
...
@@ -50,4 +50,4 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.vllm \
--block-size
$BLOCK_SIZE
\
--block-size
$BLOCK_SIZE
\
--enforce-eager
\
--enforce-eager
\
--is-prefill-worker
\
--is-prefill-worker
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
5559
"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:
20083
"}'
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment