Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c916cd42
Unverified
Commit
c916cd42
authored
Feb 24, 2026
by
atchernych
Committed by
GitHub
Feb 25, 2026
Browse files
feat: Support epp's "pods" interface in Dynamo fixes [DEP-424] (#6302)
Signed-off-by:
Anna Tchernych
<
atchernych@nvidia.com
>
parent
5a4c96db
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
403 additions
and
39 deletions
+403
-39
lib/llm/src/kv_router/prefill_router.rs
lib/llm/src/kv_router/prefill_router.rs
+5
-0
lib/llm/src/kv_router/push_router.rs
lib/llm/src/kv_router/push_router.rs
+1
-0
lib/llm/src/kv_router/scheduler.rs
lib/llm/src/kv_router/scheduler.rs
+15
-7
recipes/llama-3-70b/vllm/agg/gaie/deploy.yaml
recipes/llama-3-70b/vllm/agg/gaie/deploy.yaml
+82
-32
recipes/llama-3-70b/vllm/disagg-single-node/gaie/deploy.yaml
recipes/llama-3-70b/vllm/disagg-single-node/gaie/deploy.yaml
+258
-0
recipes/llama-3-70b/vllm/disagg-single-node/gaie/http-route.yaml
.../llama-3-70b/vllm/disagg-single-node/gaie/http-route.yaml
+42
-0
No files found.
lib/llm/src/kv_router/prefill_router.rs
View file @
c916cd42
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use
std
::
collections
::
HashSet
;
use
std
::
sync
::{
Arc
,
OnceLock
};
use
anyhow
::
Result
;
...
...
@@ -20,6 +21,7 @@ use dynamo_runtime::{
use
crate
::{
discovery
::
ModelManager
,
kv_router
::
protocols
::
WorkerId
,
kv_router
::{
KvPushRouter
,
KvRouterConfig
,
RouterConfigOverride
,
protocols
::
BlockExtraInfo
},
protocols
::
common
::
llm_backend
::{
LLMEngineOutput
,
PreprocessedRequest
},
protocols
::
common
::
preprocessor
::{
BootstrapInfo
,
PrefillResult
},
...
...
@@ -305,6 +307,7 @@ impl PrefillRouter {
false
,
lora_name
,
priority_jump
,
None
,
)
.await
{
...
...
@@ -505,6 +508,7 @@ impl PrefillRouter {
update_states
:
bool
,
lora_name
:
Option
<
String
>
,
priority_jump
:
f64
,
allowed_worker_ids
:
Option
<
HashSet
<
WorkerId
>>
,
)
->
Result
<
(
u64
,
u32
)
>
{
let
prefill_router
=
self
.prefill_router
...
...
@@ -523,6 +527,7 @@ impl PrefillRouter {
update_states
,
lora_name
,
priority_jump
,
allowed_worker_ids
,
)
.await
?
;
Ok
((
worker
.worker_id
,
worker
.dp_rank
))
...
...
lib/llm/src/kv_router/push_router.rs
View file @
c916cd42
...
...
@@ -217,6 +217,7 @@ impl KvPushRouter {
!
is_query_only
,
lora_name
,
priority_jump
,
None
,
)
.await
?
;
...
...
lib/llm/src/kv_router/scheduler.rs
View file @
c916cd42
...
...
@@ -63,6 +63,8 @@ pub struct SchedulingRequest {
pub
lora_name
:
Option
<
String
>
,
/// Priority jump in seconds; decreases effective arrival time in the queue.
pub
priority_jump
:
f64
,
/// Optional set of allowed worker IDs to restrict routing decisions (EPP).
pub
allowed_worker_ids
:
Option
<
HashSet
<
WorkerId
>>
,
resp_tx
:
Option
<
tokio
::
sync
::
oneshot
::
Sender
<
Result
<
SchedulingResponse
,
KvSchedulerError
>>>
,
}
...
...
@@ -204,7 +206,8 @@ impl KvScheduler {
update_states
:
bool
,
lora_name
:
Option
<
String
>
,
priority_jump
:
f64
,
)
->
Result
<
WorkerWithDpRank
,
KvSchedulerError
>
{
allowed_worker_ids
:
Option
<
HashSet
<
WorkerId
>>
,
)
->
Result
<
SchedulingResponse
,
KvSchedulerError
>
{
#[cfg(feature
=
"bench"
)]
let
start
=
Instant
::
now
();
...
...
@@ -220,6 +223,7 @@ impl KvScheduler {
update_states
,
lora_name
,
priority_jump
,
allowed_worker_ids
,
resp_tx
:
Some
(
resp_tx
),
};
...
...
@@ -245,7 +249,7 @@ impl KvScheduler {
"scheduler.schedule completed"
);
Ok
(
response
.best_worker
)
Ok
(
response
)
}
pub
async
fn
add_request
(
&
self
,
req
:
SequenceRequest
)
->
Result
<
(),
SequenceError
>
{
...
...
@@ -404,7 +408,11 @@ impl WorkerSelector for DefaultWorkerSelector {
)
->
Result
<
WorkerSelectionResult
,
KvSchedulerError
>
{
assert
!
(
request
.isl_tokens
>
0
);
if
workers
.is_empty
()
{
let
allowed_ids
=
request
.allowed_worker_ids
.as_ref
();
if
allowed_ids
.map_or
(
workers
.is_empty
(),
|
ids
|
{
!
workers
.keys
()
.any
(|
wid
|
ids
.contains
(
wid
))
})
{
return
Err
(
KvSchedulerError
::
NoEndpoints
);
}
...
...
@@ -424,10 +432,10 @@ impl WorkerSelector for DefaultWorkerSelector {
.and_then
(|
cfg
|
cfg
.overlap_score_weight
)
.unwrap_or
(
self
.kv_router_config.overlap_score_weight
);
// Calculate logits for each worker with dp_rank
// Outer loop: iterate over all workers from runtime config
// Inner loop: iterate over all dp_ranks for each worker
for
(
worker_id
,
config
)
in
workers
.iter
()
{
for
(
worker_id
,
config
)
in
workers
.iter
()
.filter
(|(
wid
,
_
)|
allowed_ids
.is_none_or
(|
ids
|
ids
.contains
(
wid
)))
{
let
data_parallel_size
=
config
.data_parallel_size
;
for
dp_rank
in
0
..
data_parallel_size
{
...
...
recipes/llama-3-70b/vllm/agg/gaie/deploy.yaml
View file @
c916cd42
...
...
@@ -18,46 +18,38 @@ spec:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/frontend:my-tag
eppConfig
:
# This configuration uses Dynamo's KV-aware scorer for intelligent routing
# This config uses the same disagg-profile-handler as disaggregated deployments.
# The handler's graceful degradation feature makes this possible:
# - With no "prefill" profile defined, it runs only the "decode" profile.
# - The decode scorer receives isDisaggregated=false, so the Dynamo KV router
# uses full overlap scoring (overlap_score_weight=1.0) for aggregated mode.
# - If prefill workers were added later (and a prefill profile configured),
# the same handler would automatically switch to disaggregated routing.
config
:
# Plugins define the behavior of EPP
plugins
:
# Required: tells EPP which profile to use (even if you only have one)
-
type
:
single-profile-handler
# Picker: chooses the final endpoint after scoring
-
type
:
disagg-profile-handler
-
name
:
decode-filter
type
:
label-filter
# allowsNoLabel: true lets pods without the subComponentType label pass through,
# which is typical for aggregated deployments where workers don't have this label.
parameters
:
label
:
"
nvidia.com/dynamo-sub-component-type"
validValues
:
-
"
decode"
allowsNoLabel
:
true
-
name
:
picker
type
:
max-score-picker
-
name
:
dyn-kv
type
:
kv-aware-scorer
# Scheduling profiles configure which plugins are used and their weights
-
name
:
dyn-decode
type
:
dyn-decode-scorer
# Only a "decode" profile — no "prefill" profile means pure aggregated mode.
schedulingProfiles
:
-
name
:
default
-
name
:
decode
plugins
:
-
pluginRef
:
dyn-kv
-
pluginRef
:
decode-filter
-
pluginRef
:
dyn-decode
weight
:
1
-
pluginRef
:
picker
Frontend
:
envFromSecret
:
hf-token-secret
componentType
:
frontend
volumeMounts
:
-
name
:
model-cache
mountPoint
:
/opt/models
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
command
:
-
python3
args
:
-
-m
-
dynamo.frontend
-
--router-mode
-
direct
envs
:
-
name
:
HF_HOME
value
:
/opt/models
replicas
:
1
VllmPrefillWorker
:
VllmDecodeWorker
:
componentType
:
worker
envFromSecret
:
hf-token-secret
volumeMounts
:
...
...
@@ -83,6 +75,64 @@ spec:
-
-c
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers
:
-
name
:
frontend
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command
:
-
python3
args
:
-
-m
-
dynamo.frontend
-
--router-mode
-
direct
ports
:
-
containerPort
:
8000
name
:
http
protocol
:
TCP
envFrom
:
-
secretRef
:
name
:
hf-token-secret
env
:
-
name
:
DYNAMO_PORT
value
:
"
8000"
-
name
:
DYN_HTTP_PORT
value
:
"
8000"
-
name
:
DYN_NAMESPACE
value
:
my-model-vllm-agg
-
name
:
DYN_COMPONENT
value
:
frontend
-
name
:
DYN_DISCOVERY_BACKEND
value
:
kubernetes
-
name
:
DYN_PARENT_DGD_K8S_NAME
value
:
llama3-70b-agg
-
name
:
DYN_PARENT_DGD_K8S_NAMESPACE
value
:
my-model
-
name
:
POD_NAME
valueFrom
:
fieldRef
:
fieldPath
:
metadata.name
-
name
:
POD_NAMESPACE
valueFrom
:
fieldRef
:
fieldPath
:
metadata.namespace
-
name
:
POD_UID
valueFrom
:
fieldRef
:
fieldPath
:
metadata.uid
livenessProbe
:
httpGet
:
path
:
/live
port
:
http
initialDelaySeconds
:
15
periodSeconds
:
10
readinessProbe
:
httpGet
:
path
:
/health
port
:
http
initialDelaySeconds
:
10
periodSeconds
:
10
replicas
:
1
resources
:
limits
:
...
...
recipes/llama-3-70b/vllm/disagg-single-node/gaie/deploy.yaml
0 → 100644
View file @
c916cd42
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
llama3-70b-disagg
spec
:
backendFramework
:
vllm
pvcs
:
-
name
:
model-cache
create
:
false
services
:
Epp
:
envFromSecret
:
hf-token-secret
componentType
:
epp
replicas
:
1
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/epp-image:my-tag
eppConfig
:
config
:
plugins
:
-
type
:
disagg-profile-handler
-
name
:
prefill-filter
type
:
label-filter
parameters
:
label
:
"
nvidia.com/dynamo-sub-component-type"
validValues
:
-
"
prefill"
allowsNoLabel
:
false
-
name
:
decode-filter
type
:
label-filter
parameters
:
label
:
"
nvidia.com/dynamo-sub-component-type"
validValues
:
-
"
decode"
allowsNoLabel
:
false
-
name
:
picker
type
:
max-score-picker
-
name
:
dyn-prefill
type
:
dyn-prefill-scorer
-
name
:
dyn-decode
type
:
dyn-decode-scorer
schedulingProfiles
:
-
name
:
prefill
plugins
:
-
pluginRef
:
prefill-filter
-
pluginRef
:
dyn-prefill
weight
:
1
-
pluginRef
:
picker
-
name
:
decode
plugins
:
-
pluginRef
:
decode-filter
-
pluginRef
:
dyn-decode
weight
:
1
-
pluginRef
:
picker
VllmPrefillWorker
:
componentType
:
worker
subComponentType
:
prefill
envFromSecret
:
hf-token-secret
volumeMounts
:
-
name
:
model-cache
mountPoint
:
/opt/models
sharedMemory
:
size
:
80Gi
extraPodSpec
:
affinity
:
podAffinity
:
preferredDuringSchedulingIgnoredDuringExecution
:
-
weight
:
100
podAffinityTerm
:
labelSelector
:
matchExpressions
:
-
key
:
nvidia.com/dynamo-component-type
operator
:
In
values
:
-
worker
topologyKey
:
kubernetes.io/hostname
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--is-prefill-worker
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers
:
-
name
:
frontend
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command
:
-
python3
args
:
-
-m
-
dynamo.frontend
-
--router-mode
-
direct
ports
:
-
containerPort
:
8000
name
:
http
protocol
:
TCP
envFrom
:
-
secretRef
:
name
:
hf-token-secret
env
:
-
name
:
DYNAMO_PORT
value
:
"
8000"
-
name
:
DYN_HTTP_PORT
value
:
"
8000"
-
name
:
DYN_NAMESPACE
value
:
a-epp-vllm-disagg
-
name
:
DYN_COMPONENT
value
:
frontend
-
name
:
DYN_DISCOVERY_BACKEND
value
:
kubernetes
-
name
:
DYN_PARENT_DGD_K8S_NAME
value
:
llama3-70b-disagg
-
name
:
DYN_PARENT_DGD_K8S_NAMESPACE
value
:
a-epp
-
name
:
POD_NAME
valueFrom
:
fieldRef
:
fieldPath
:
metadata.name
-
name
:
POD_NAMESPACE
valueFrom
:
fieldRef
:
fieldPath
:
metadata.namespace
-
name
:
POD_UID
valueFrom
:
fieldRef
:
fieldPath
:
metadata.uid
livenessProbe
:
httpGet
:
path
:
/live
port
:
http
initialDelaySeconds
:
15
periodSeconds
:
10
readinessProbe
:
httpGet
:
path
:
/health
port
:
http
initialDelaySeconds
:
10
periodSeconds
:
10
replicas
:
2
resources
:
limits
:
gpu
:
"
2"
requests
:
gpu
:
"
2"
VllmDecodeWorker
:
componentType
:
worker
subComponentType
:
decode
envFromSecret
:
hf-token-secret
volumeMounts
:
-
name
:
model-cache
mountPoint
:
/opt/models
sharedMemory
:
size
:
80Gi
extraPodSpec
:
affinity
:
podAffinity
:
preferredDuringSchedulingIgnoredDuringExecution
:
-
weight
:
100
podAffinityTerm
:
labelSelector
:
matchExpressions
:
-
key
:
nvidia.com/dynamo-component-type
operator
:
In
values
:
-
worker
topologyKey
:
kubernetes.io/hostname
mainContainer
:
env
:
-
name
:
SERVED_MODEL_NAME
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
MODEL_PATH
value
:
"
RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir
:
/workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers
:
-
name
:
frontend
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command
:
-
python3
args
:
-
-m
-
dynamo.frontend
-
--router-mode
-
direct
ports
:
-
containerPort
:
8000
name
:
http
protocol
:
TCP
envFrom
:
-
secretRef
:
name
:
hf-token-secret
env
:
-
name
:
DYNAMO_PORT
value
:
"
8000"
-
name
:
DYN_HTTP_PORT
value
:
"
8000"
-
name
:
DYN_NAMESPACE
value
:
a-epp-vllm-disagg
-
name
:
DYN_COMPONENT
value
:
frontend
-
name
:
DYN_DISCOVERY_BACKEND
value
:
kubernetes
-
name
:
DYN_PARENT_DGD_K8S_NAME
value
:
llama3-70b-disagg
-
name
:
DYN_PARENT_DGD_K8S_NAMESPACE
value
:
a-epp
-
name
:
POD_NAME
valueFrom
:
fieldRef
:
fieldPath
:
metadata.name
-
name
:
POD_NAMESPACE
valueFrom
:
fieldRef
:
fieldPath
:
metadata.namespace
-
name
:
POD_UID
valueFrom
:
fieldRef
:
fieldPath
:
metadata.uid
livenessProbe
:
httpGet
:
path
:
/live
port
:
http
initialDelaySeconds
:
15
periodSeconds
:
10
readinessProbe
:
httpGet
:
path
:
/health
port
:
http
initialDelaySeconds
:
10
periodSeconds
:
10
replicas
:
1
resources
:
limits
:
gpu
:
"
4"
requests
:
gpu
:
"
4"
\ No newline at end of file
recipes/llama-3-70b/vllm/disagg-single-node/gaie/http-route.yaml
0 → 100644
View file @
c916cd42
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
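# NOTE: metadata.namespace is intentionally not set here; supply the target namespace with kubectl apply -n <namespace>.
# backendRefs has no namespace field, so the InferencePool is looked up in the HTTPRoute's own namespace; add backendRefs.namespace if the pool is deployed elsewhere.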
apiVersion
:
gateway.networking.k8s.io/v1
kind
:
HTTPRoute
metadata
:
name
:
llama3-70b-disagg-route
spec
:
hostnames
:
-
llama3-70b-disagg.example.com
parentRefs
:
-
group
:
gateway.networking.k8s.io
kind
:
Gateway
name
:
inference-gateway
namespace
:
kgateway-system
rules
:
-
backendRefs
:
-
group
:
inference.networking.k8s.io
kind
:
InferencePool
name
:
llama3-70b-disagg-pool
port
:
8000
weight
:
1
matches
:
-
path
:
type
:
PathPrefix
value
:
/
timeouts
:
request
:
300s
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment