Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
e3cee95f
Unverified
Commit
e3cee95f
authored
Nov 17, 2025
by
Kris Hung
Committed by
GitHub
Nov 17, 2025
Browse files
fix: Fix port collision and pass prefill param in kvbm connector (#4411)
Signed-off-by:
krishung5
<
krish@nvidia.com
>
parent
88dfd1b3
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
15 additions
and
14 deletions
+15
-14
components/src/dynamo/vllm/args.py
components/src/dynamo/vllm/args.py
+8
-2
examples/backends/vllm/launch/disagg_kvbm_router.sh
examples/backends/vllm/launch/disagg_kvbm_router.sh
+7
-12
No files found.
components/src/dynamo/vllm/args.py
View file @
e3cee95f
...
@@ -325,10 +325,16 @@ def parse_args() -> Config:
...
@@ -325,10 +325,16 @@ def parse_args() -> Config:
def
create_kv_events_config
(
config
:
Config
)
->
Optional
[
KVEventsConfig
]:
def
create_kv_events_config
(
config
:
Config
)
->
Optional
[
KVEventsConfig
]:
"""Create KVEventsConfig for prefix caching if needed."""
"""Create KVEventsConfig for prefix caching if needed."""
if
config
.
is_decode_worker
:
logger
.
info
(
f
"Decode worker detected (is_decode_worker=
{
config
.
is_decode_worker
}
): "
f
"kv_events_config disabled (decode workers don't publish KV events)"
)
return
None
# If prefix caching is not enabled, no events config needed
# If prefix caching is not enabled, no events config needed
if
not
config
.
engine_args
.
enable_prefix_caching
or
config
.
is_decode_worker
:
if
not
config
.
engine_args
.
enable_prefix_caching
:
logger
.
info
(
"No kv_events_config required"
)
logger
.
info
(
"No kv_events_config required
: prefix caching is disabled
"
)
return
None
return
None
# There is a bug with KV events publishing when LORA is enabled.
# There is a bug with KV events publishing when LORA is enabled.
...
...
examples/backends/vllm/launch/disagg_kvbm_router.sh
View file @
e3cee95f
...
@@ -10,19 +10,11 @@ export PYTHONHASHSEED=0
...
@@ -10,19 +10,11 @@ export PYTHONHASHSEED=0
# Common configuration
# Common configuration
MODEL
=
"Qwen/Qwen3-0.6B"
MODEL
=
"Qwen/Qwen3-0.6B"
# run decode router with kv-overlap-score-weight 0 for pure load balancing
python
-m
dynamo.frontend
\
python
-m
dynamo.frontend
\
--router-mode
kv
\
--router-mode
kv
\
--http-port
8000
\
--http-port
8000
\
--kv-overlap-score-weight
0
\
--router-reset-states
&
--router-reset-states
&
# run standalone router service for prefill workers
python
-m
dynamo.router
\
--endpoint
dynamo.prefill.generate
\
--router-reset-states
\
--no-track-active-blocks
&
# two decode workers (without KVBM)
# two decode workers (without KVBM)
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
# --enforce-eager is added for quick deployment. for production use, need to remove this flag
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.vllm
\
...
@@ -30,6 +22,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
...
@@ -30,6 +22,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--enforce-eager
\
--enforce-eager
\
--is-decode-worker
&
--is-decode-worker
&
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20096
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.vllm
\
--model
$MODEL
\
--model
$MODEL
\
--enforce-eager
\
--enforce-eager
\
...
@@ -37,6 +30,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
...
@@ -37,6 +30,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
# two prefill workers with KVBM enabled
# two prefill workers with KVBM enabled
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
# Each worker needs unique ZMQ ports to avoid KVBM coordination conflicts
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56001
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56001
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56002
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56002
\
CUDA_VISIBLE_DEVICES
=
2
DYN_KVBM_CPU_CACHE_GB
=
20
\
CUDA_VISIBLE_DEVICES
=
2
DYN_KVBM_CPU_CACHE_GB
=
20
\
...
@@ -44,10 +38,10 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
...
@@ -44,10 +38,10 @@ CUDA_VISIBLE_DEVICES=2 DYN_KVBM_CPU_CACHE_GB=20 \
--model
$MODEL
\
--model
$MODEL
\
--enforce-eager
\
--enforce-eager
\
--is-prefill-worker
\
--is-prefill-worker
\
--connector
kvbm &
--connector
kvbm nixl
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
&
DYN_VLLM_KV_EVENT_PORT
=
20081
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20098
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56003
\
DYN_KVBM_LEADER_ZMQ_PUB_PORT
=
56003
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56004
\
DYN_KVBM_LEADER_ZMQ_ACK_PORT
=
56004
\
CUDA_VISIBLE_DEVICES
=
3
DYN_KVBM_CPU_CACHE_GB
=
20
\
CUDA_VISIBLE_DEVICES
=
3
DYN_KVBM_CPU_CACHE_GB
=
20
\
...
@@ -55,4 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
...
@@ -55,4 +49,5 @@ CUDA_VISIBLE_DEVICES=3 DYN_KVBM_CPU_CACHE_GB=20 \
--model
$MODEL
\
--model
$MODEL
\
--enforce-eager
\
--enforce-eager
\
--is-prefill-worker
\
--is-prefill-worker
\
--connector
kvbm
--connector
kvbm nixl
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment