"vscode:/vscode.git/clone" did not exist on "1b32e0264888200a0e6187496a816ef597a7f320"
Unverified Commit c916cd42 authored by atchernych, committed by GitHub
Browse files

feat: Support epp's "pods" interface in Dynamo fixes [DEP-424] (#6302)


Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
parent 5a4c96db
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use std::collections::HashSet;
use std::sync::{Arc, OnceLock}; use std::sync::{Arc, OnceLock};
use anyhow::Result; use anyhow::Result;
...@@ -20,6 +21,7 @@ use dynamo_runtime::{ ...@@ -20,6 +21,7 @@ use dynamo_runtime::{
use crate::{ use crate::{
discovery::ModelManager, discovery::ModelManager,
kv_router::protocols::WorkerId,
kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo}, kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo},
protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest}, protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
protocols::common::preprocessor::{BootstrapInfo, PrefillResult}, protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
...@@ -305,6 +307,7 @@ impl PrefillRouter { ...@@ -305,6 +307,7 @@ impl PrefillRouter {
false, false,
lora_name, lora_name,
priority_jump, priority_jump,
None,
) )
.await .await
{ {
...@@ -505,6 +508,7 @@ impl PrefillRouter { ...@@ -505,6 +508,7 @@ impl PrefillRouter {
update_states: bool, update_states: bool,
lora_name: Option<String>, lora_name: Option<String>,
priority_jump: f64, priority_jump: f64,
allowed_worker_ids: Option<HashSet<WorkerId>>,
) -> Result<(u64, u32)> { ) -> Result<(u64, u32)> {
let prefill_router = self let prefill_router = self
.prefill_router .prefill_router
...@@ -523,6 +527,7 @@ impl PrefillRouter { ...@@ -523,6 +527,7 @@ impl PrefillRouter {
update_states, update_states,
lora_name, lora_name,
priority_jump, priority_jump,
allowed_worker_ids,
) )
.await?; .await?;
Ok((worker.worker_id, worker.dp_rank)) Ok((worker.worker_id, worker.dp_rank))
......
...@@ -217,6 +217,7 @@ impl KvPushRouter { ...@@ -217,6 +217,7 @@ impl KvPushRouter {
!is_query_only, !is_query_only,
lora_name, lora_name,
priority_jump, priority_jump,
None,
) )
.await?; .await?;
......
...@@ -63,6 +63,8 @@ pub struct SchedulingRequest { ...@@ -63,6 +63,8 @@ pub struct SchedulingRequest {
pub lora_name: Option<String>, pub lora_name: Option<String>,
/// Priority jump in seconds; decreases effective arrival time in the queue. /// Priority jump in seconds; decreases effective arrival time in the queue.
pub priority_jump: f64, pub priority_jump: f64,
/// Optional set of allowed worker IDs to restrict routing decisions (EPP).
pub allowed_worker_ids: Option<HashSet<WorkerId>>,
resp_tx: Option<tokio::sync::oneshot::Sender<Result<SchedulingResponse, KvSchedulerError>>>, resp_tx: Option<tokio::sync::oneshot::Sender<Result<SchedulingResponse, KvSchedulerError>>>,
} }
...@@ -204,7 +206,8 @@ impl KvScheduler { ...@@ -204,7 +206,8 @@ impl KvScheduler {
update_states: bool, update_states: bool,
lora_name: Option<String>, lora_name: Option<String>,
priority_jump: f64, priority_jump: f64,
) -> Result<WorkerWithDpRank, KvSchedulerError> { allowed_worker_ids: Option<HashSet<WorkerId>>,
) -> Result<SchedulingResponse, KvSchedulerError> {
#[cfg(feature = "bench")] #[cfg(feature = "bench")]
let start = Instant::now(); let start = Instant::now();
...@@ -220,6 +223,7 @@ impl KvScheduler { ...@@ -220,6 +223,7 @@ impl KvScheduler {
update_states, update_states,
lora_name, lora_name,
priority_jump, priority_jump,
allowed_worker_ids,
resp_tx: Some(resp_tx), resp_tx: Some(resp_tx),
}; };
...@@ -245,7 +249,7 @@ impl KvScheduler { ...@@ -245,7 +249,7 @@ impl KvScheduler {
"scheduler.schedule completed" "scheduler.schedule completed"
); );
Ok(response.best_worker) Ok(response)
} }
pub async fn add_request(&self, req: SequenceRequest) -> Result<(), SequenceError> { pub async fn add_request(&self, req: SequenceRequest) -> Result<(), SequenceError> {
...@@ -404,7 +408,11 @@ impl WorkerSelector for DefaultWorkerSelector { ...@@ -404,7 +408,11 @@ impl WorkerSelector for DefaultWorkerSelector {
) -> Result<WorkerSelectionResult, KvSchedulerError> { ) -> Result<WorkerSelectionResult, KvSchedulerError> {
assert!(request.isl_tokens > 0); assert!(request.isl_tokens > 0);
if workers.is_empty() { let allowed_ids = request.allowed_worker_ids.as_ref();
if allowed_ids.map_or(workers.is_empty(), |ids| {
!workers.keys().any(|wid| ids.contains(wid))
}) {
return Err(KvSchedulerError::NoEndpoints); return Err(KvSchedulerError::NoEndpoints);
} }
...@@ -424,10 +432,10 @@ impl WorkerSelector for DefaultWorkerSelector { ...@@ -424,10 +432,10 @@ impl WorkerSelector for DefaultWorkerSelector {
.and_then(|cfg| cfg.overlap_score_weight) .and_then(|cfg| cfg.overlap_score_weight)
.unwrap_or(self.kv_router_config.overlap_score_weight); .unwrap_or(self.kv_router_config.overlap_score_weight);
// Calculate logits for each worker with dp_rank for (worker_id, config) in workers
// Outer loop: iterate over all workers from runtime config .iter()
// Inner loop: iterate over all dp_ranks for each worker .filter(|(wid, _)| allowed_ids.is_none_or(|ids| ids.contains(wid)))
for (worker_id, config) in workers.iter() { {
let data_parallel_size = config.data_parallel_size; let data_parallel_size = config.data_parallel_size;
for dp_rank in 0..data_parallel_size { for dp_rank in 0..data_parallel_size {
......
...@@ -18,46 +18,38 @@ spec: ...@@ -18,46 +18,38 @@ spec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/frontend:my-tag image: nvcr.io/nvidia/ai-dynamo/frontend:my-tag
eppConfig: eppConfig:
# This configuration uses Dynamo's KV-aware scorer for intelligent routing # This config uses the same disagg-profile-handler as disaggregated deployments.
# The handler's graceful degradation feature makes this possible:
# - With no "prefill" profile defined, it runs only the "decode" profile.
# - The decode scorer receives isDisaggregated=false, so the Dynamo KV router
# uses full overlap scoring (overlap_score_weight=1.0) for aggregated mode.
# - If prefill workers were added later (and a prefill profile configured),
# the same handler would automatically switch to disaggregated routing.
config: config:
# Plugins define the behavior of EPP
plugins: plugins:
# Required: tells EPP which profile to use (even if you only have one) - type: disagg-profile-handler
- type: single-profile-handler - name: decode-filter
# Picker: chooses the final endpoint after scoring type: label-filter
# allowsNoLabel: true lets pods without the subComponentType label pass through,
# which is typical for aggregated deployments where workers don't have this label.
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "decode"
allowsNoLabel: true
- name: picker - name: picker
type: max-score-picker type: max-score-picker
- name: dyn-kv - name: dyn-decode
type: kv-aware-scorer type: dyn-decode-scorer
# Scheduling profiles configure which plugins are used and their weights # Only a "decode" profile — no "prefill" profile means pure aggregated mode.
schedulingProfiles: schedulingProfiles:
- name: default - name: decode
plugins: plugins:
- pluginRef: dyn-kv - pluginRef: decode-filter
- pluginRef: dyn-decode
weight: 1 weight: 1
- pluginRef: picker - pluginRef: picker
Frontend: VllmDecodeWorker:
envFromSecret: hf-token-secret
componentType: frontend
volumeMounts:
- name: model-cache
mountPoint: /opt/models
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
envs:
- name: HF_HOME
value: /opt/models
replicas: 1
VllmPrefillWorker:
componentType: worker componentType: worker
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
...@@ -83,6 +75,64 @@ spec: ...@@ -83,6 +75,64 @@ spec:
- -c - -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers:
- name: frontend
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
ports:
- containerPort: 8000
name: http
protocol: TCP
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: DYNAMO_PORT
value: "8000"
- name: DYN_HTTP_PORT
value: "8000"
- name: DYN_NAMESPACE
value: my-model-vllm-agg
- name: DYN_COMPONENT
value: frontend
- name: DYN_DISCOVERY_BACKEND
value: kubernetes
- name: DYN_PARENT_DGD_K8S_NAME
value: llama3-70b-agg
- name: DYN_PARENT_DGD_K8S_NAMESPACE
value: my-model
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
livenessProbe:
httpGet:
path: /live
port: http
initialDelaySeconds: 15
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
replicas: 1 replicas: 1
resources: resources:
limits: limits:
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: llama3-70b-disagg
spec:
backendFramework: vllm
pvcs:
- name: model-cache
create: false
services:
Epp:
envFromSecret: hf-token-secret
componentType: epp
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/epp-image:my-tag
eppConfig:
config:
plugins:
- type: disagg-profile-handler
- name: prefill-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "prefill"
allowsNoLabel: false
- name: decode-filter
type: label-filter
parameters:
label: "nvidia.com/dynamo-sub-component-type"
validValues:
- "decode"
allowsNoLabel: false
- name: picker
type: max-score-picker
- name: dyn-prefill
type: dyn-prefill-scorer
- name: dyn-decode
type: dyn-decode-scorer
schedulingProfiles:
- name: prefill
plugins:
- pluginRef: prefill-filter
- pluginRef: dyn-prefill
weight: 1
- pluginRef: picker
- name: decode
plugins:
- pluginRef: decode-filter
- pluginRef: dyn-decode
weight: 1
- pluginRef: picker
VllmPrefillWorker:
componentType: worker
subComponentType: prefill
envFromSecret: hf-token-secret
volumeMounts:
- name: model-cache
mountPoint: /opt/models
sharedMemory:
size: 80Gi
extraPodSpec:
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers:
- name: frontend
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
ports:
- containerPort: 8000
name: http
protocol: TCP
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: DYNAMO_PORT
value: "8000"
- name: DYN_HTTP_PORT
value: "8000"
- name: DYN_NAMESPACE
value: a-epp-vllm-disagg
- name: DYN_COMPONENT
value: frontend
- name: DYN_DISCOVERY_BACKEND
value: kubernetes
- name: DYN_PARENT_DGD_K8S_NAME
value: llama3-70b-disagg
- name: DYN_PARENT_DGD_K8S_NAMESPACE
value: a-epp
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
livenessProbe:
httpGet:
path: /live
port: http
initialDelaySeconds: 15
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
replicas: 2
resources:
limits:
gpu: "2"
requests:
gpu: "2"
VllmDecodeWorker:
componentType: worker
subComponentType: decode
envFromSecret: hf-token-secret
volumeMounts:
- name: model-cache
mountPoint: /opt/models
sharedMemory:
size: 80Gi
extraPodSpec:
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: nvidia.com/dynamo-component-type
operator: In
values:
- worker
topologyKey: kubernetes.io/hostname
mainContainer:
env:
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
# Frontend sidecar: receives requests from kGateway on port 8000
# and routes them to the vLLM worker in the same pod
containers:
- name: frontend
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
command:
- python3
args:
- -m
- dynamo.frontend
- --router-mode
- direct
ports:
- containerPort: 8000
name: http
protocol: TCP
envFrom:
- secretRef:
name: hf-token-secret
env:
- name: DYNAMO_PORT
value: "8000"
- name: DYN_HTTP_PORT
value: "8000"
- name: DYN_NAMESPACE
value: a-epp-vllm-disagg
- name: DYN_COMPONENT
value: frontend
- name: DYN_DISCOVERY_BACKEND
value: kubernetes
- name: DYN_PARENT_DGD_K8S_NAME
value: llama3-70b-disagg
- name: DYN_PARENT_DGD_K8S_NAMESPACE
value: a-epp
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
livenessProbe:
httpGet:
path: /live
port: http
initialDelaySeconds: 15
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: http
initialDelaySeconds: 10
periodSeconds: 10
replicas: 1
resources:
limits:
gpu: "4"
requests:
gpu: "4"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You can omit metadata.namespace if you apply this manifest with `kubectl apply -n <namespace>`.
# If you set backendRefs.namespace, it must match the namespace where your InferencePool is deployed.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: llama3-70b-disagg-route
spec:
hostnames:
- llama3-70b-disagg.example.com
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: inference-gateway
namespace: kgateway-system
rules:
- backendRefs:
- group: inference.networking.k8s.io
kind: InferencePool
name: llama3-70b-disagg-pool
port: 8000
weight: 1
matches:
- path:
type: PathPrefix
value: /
timeouts:
request: 300s
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment