Unverified commit c916cd42, authored by atchernych and committed by GitHub
Browse files

feat: Support EPP's "pods" interface in Dynamo; fixes [DEP-424] (#6302)


Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
parent 5a4c96db
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashSet;
use std::sync::{Arc, OnceLock};
use anyhow::Result;
......@@ -20,6 +21,7 @@ use dynamo_runtime::{
use crate::{
discovery::ModelManager,
kv_router::protocols::WorkerId,
kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo},
protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
......@@ -305,6 +307,7 @@ impl PrefillRouter {
false,
lora_name,
priority_jump,
None,
)
.await
{
......@@ -505,6 +508,7 @@ impl PrefillRouter {
update_states: bool,
lora_name: Option<String>,
priority_jump: f64,
allowed_worker_ids: Option<HashSet<WorkerId>>,
) -> Result<(u64, u32)> {
let prefill_router = self
.prefill_router
......@@ -523,6 +527,7 @@ impl PrefillRouter {
update_states,
lora_name,
priority_jump,
allowed_worker_ids,
)
.await?;
Ok((worker.worker_id, worker.dp_rank))
......
......@@ -217,6 +217,7 @@ impl KvPushRouter {
!is_query_only,
lora_name,
priority_jump,
None,
)
.await?;
......
......@@ -63,6 +63,8 @@ pub struct SchedulingRequest {
pub lora_name: Option<String>,
/// Priority jump in seconds; decreases effective arrival time in the queue.
pub priority_jump: f64,
/// Optional set of allowed worker IDs to restrict routing decisions (EPP).
pub allowed_worker_ids: Option<HashSet<WorkerId>>,
resp_tx: Option<tokio::sync::oneshot::Sender<Result<SchedulingResponse, KvSchedulerError>>>,
}
......@@ -204,7 +206,8 @@ impl KvScheduler {
update_states: bool,
lora_name: Option<String>,
priority_jump: f64,
) -> Result<WorkerWithDpRank, KvSchedulerError> {
allowed_worker_ids: Option<HashSet<WorkerId>>,
) -> Result<SchedulingResponse, KvSchedulerError> {
#[cfg(feature = "bench")]
let start = Instant::now();
......@@ -220,6 +223,7 @@ impl KvScheduler {
update_states,
lora_name,
priority_jump,
allowed_worker_ids,
resp_tx: Some(resp_tx),
};
......@@ -245,7 +249,7 @@ impl KvScheduler {
"scheduler.schedule completed"
);
Ok(response.best_worker)
Ok(response)
}
pub async fn add_request(&self, req: SequenceRequest) -> Result<(), SequenceError> {
......@@ -404,7 +408,11 @@ impl WorkerSelector for DefaultWorkerSelector {
) -> Result<WorkerSelectionResult, KvSchedulerError> {
assert!(request.isl_tokens > 0);
if workers.is_empty() {
let allowed_ids = request.allowed_worker_ids.as_ref();
if allowed_ids.map_or(workers.is_empty(), |ids| {
!workers.keys().any(|wid| ids.contains(wid))
}) {
return Err(KvSchedulerError::NoEndpoints);
}
......@@ -424,10 +432,10 @@ impl WorkerSelector for DefaultWorkerSelector {
.and_then(|cfg| cfg.overlap_score_weight)
.unwrap_or(self.kv_router_config.overlap_score_weight);
// Calculate logits for each worker with dp_rank
// Outer loop: iterate over all workers from runtime config
// Inner loop: iterate over all dp_ranks for each worker
for (worker_id, config) in workers.iter() {
for (worker_id, config) in workers
.iter()
.filter(|(wid, _)| allowed_ids.is_none_or(|ids| ids.contains(wid)))
{
let data_parallel_size = config.data_parallel_size;
for dp_rank in 0..data_parallel_size {
......
......@@ -18,46 +18,38 @@ spec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/frontend:my-tag
eppConfig:
# This configuration uses Dynamo's KV-aware scorer for intelligent routing
# This config uses the same disagg-profile-handler as disaggregated deployments.
# The handler's graceful degradation feature makes this possible:
# - With no "prefill" profile defined, it runs only the "decode" profile.
# - The decode scorer receives isDisaggregated=false, so the Dynamo KV router
# uses full overlap scoring (overlap_score_weight=1.0) for aggregated mode.
# - If prefill workers were added later (and a prefill profile configured),
# the same handler would automatically switch to disaggregated routing.
config:
# Plugins define the behavior of EPP
plugins:
# Required: tells EPP which profile to use (even if you only have one)
- type: single-profile-handler
# Picker: chooses the final endpoint after scoring
- type: disagg-profile-handler
- name: decode-filter
type: label-filter
# allowsNoLabel: true lets pods without the subComponentType label pass through,
# which is typical for aggregated deployments where workers don't have this label.
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "decode"
allowsNoLabel: true
- name: picker
type: max-score-picker
- name: dyn-kv
type: kv-aware-scorer
# Scheduling profiles configure which plugins are used and their weights
- name: dyn-decode
type: dyn-decode-scorer
# Only a "decode" profile — no "prefill" profile means pure aggregated mode.
schedulingProfiles:
- name: default
- name: decode
plugins:
- pluginRef: dyn-kv
- pluginRef: decode-filter
- pluginRef: dyn-decode
weight: 1
- pluginRef: picker
Frontend:
envFromSecret: hf-token-secret
componentType: frontend
volumeMounts:
- name: model-cache
mountPoint: /opt/models
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
envs:
- name: HF_HOME
value: /opt/models
replicas: 1
VllmPrefillWorker:
VllmDecodeWorker:
componentType: worker
envFromSecret: hf-token-secret
volumeMounts:
......@@ -83,6 +75,64 @@ spec:
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers:
- name: frontend
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
ports:
- containerPort: 8000
name: http
protocol: TCP
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: DYNAMO_PORT
value: "8000"
- name: DYN_HTTP_PORT
value: "8000"
- name: DYN_NAMESPACE
value: my-model-vllm-agg
- name: DYN_COMPONENT
value: frontend
- name: DYN_DISCOVERY_BACKEND
value: kubernetes
- name: DYN_PARENT_DGD_K8S_NAME
value: llama3-70b-agg
- name: DYN_PARENT_DGD_K8S_NAMESPACE
value: my-model
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
livenessProbe:
httpGet:
path: /live
port: http
initialDelaySeconds: 15
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
replicas: 1
resources:
limits:
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: llama3-70b-disagg
spec:
backendFramework: vllm
pvcs:
- name: model-cache
create: false
services:
Epp:
envFromSecret: hf-token-secret
componentType: epp
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/epp-image:my-tag
eppConfig:
config:
plugins:
- type: disagg-profile-handler
- name: prefill-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "prefill"
allowsNoLabel: false
- name: decode-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "decode"
allowsNoLabel: false
- name: picker
type: max-score-picker
- name: dyn-prefill
type: dyn-prefill-scorer
- name: dyn-decode
type: dyn-decode-scorer
schedulingProfiles:
- name: prefill
plugins:
- pluginRef: prefill-filter
- pluginRef: dyn-prefill
weight: 1
- pluginRef: picker
- name: decode
plugins:
- pluginRef: decode-filter
- pluginRef: dyn-decode
weight: 1
- pluginRef: picker
VllmPrefillWorker:
componentType: worker
subComponentType: prefill
envFromSecret: hf-token-secret
volumeMounts:
- name: model-cache
mountPoint: /opt/models
sharedMemory:
size: 80Gi
extraPodSpec:
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers:
- name: frontend
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
ports:
- containerPort: 8000
name: http
protocol: TCP
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: DYNAMO_PORT
value: "8000"
- name: DYN_HTTP_PORT
value: "8000"
- name: DYN_NAMESPACE
value: a-epp-vllm-disagg
- name: DYN_COMPONENT
value: frontend
- name: DYN_DISCOVERY_BACKEND
value: kubernetes
- name: DYN_PARENT_DGD_K8S_NAME
value: llama3-70b-disagg
- name: DYN_PARENT_DGD_K8S_NAMESPACE
value: a-epp
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
livenessProbe:
httpGet:
path: /live
port: http
initialDelaySeconds: 15
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
replicas: 2
resources:
limits:
gpu: "2"
requests:
gpu: "2"
VllmDecodeWorker:
componentType: worker
subComponentType: decode
envFromSecret: hf-token-secret
volumeMounts:
- name: model-cache
mountPoint: /opt/models
sharedMemory:
size: 80Gi
extraPodSpec:
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers:
- name: frontend
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
ports:
- containerPort: 8000
name: http
protocol: TCP
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: DYNAMO_PORT
value: "8000"
- name: DYN_HTTP_PORT
value: "8000"
- name: DYN_NAMESPACE
value: a-epp-vllm-disagg
- name: DYN_COMPONENT
value: frontend
- name: DYN_DISCOVERY_BACKEND
value: kubernetes
- name: DYN_PARENT_DGD_K8S_NAME
value: llama3-70b-disagg
- name: DYN_PARENT_DGD_K8S_NAMESPACE
value: a-epp
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
livenessProbe:
httpGet:
path: /live
port: http
initialDelaySeconds: 15
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
replicas: 1
resources:
limits:
gpu: "4"
requests:
gpu: "4"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is omitted here; supply the namespace with `kubectl apply -n <namespace>`.
# If you add a backendRefs.namespace field, it should match the namespace where your InferencePool is deployed.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: llama3-70b-disagg-route
spec:
hostnames:
- llama3-70b-disagg.example.com
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: inference-gateway
namespace: kgateway-system
rules:
- backendRefs:
- group: inference.networking.k8s.io
kind: InferencePool
name: llama3-70b-disagg-pool
port: 8000
weight: 1
matches:
- path:
type: PathPrefix
value: /
timeouts:
request: 300s
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment