feat: Support epp's "pods" interface in Dynamo fixes [DEP-424] (#6302)

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>

feat: Support epp's "pods" interface in Dynamo fixes [DEP-424] (#6302)
Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
c916cd42 · atchernych · GitHub · 5a4c96db · c916cd42 · c916cd42
Unverified Commit c916cd42 authored Feb 24, 2026 by atchernych Committed by GitHub Feb 25, 2026
6 changed files
--- a/lib/llm/src/kv_router/prefill_router.rs
+++ b/lib/llm/src/kv_router/prefill_router.rs
 // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
+use std::collections::HashSet;
 use std::sync::{Arc, OnceLock};
 use anyhow::Result;
@@ -20,6 +21,7 @@ use dynamo_runtime::{
 use crate::{
    discovery::ModelManager,
+    kv_router::protocols::WorkerId,
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo},
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
@@ -305,6 +307,7 @@ impl PrefillRouter {
                    false,
                    lora_name,
                    priority_jump,
+                    None,
                )
                .await
            {
@@ -505,6 +508,7 @@ impl PrefillRouter {
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
+        allowed_worker_ids: Option<HashSet<WorkerId>>,
    ) -> Result<(u64, u32)> {
        let prefill_router = self
            .prefill_router
@@ -523,6 +527,7 @@ impl PrefillRouter {
                        update_states,
                        lora_name,
                        priority_jump,
+                        allowed_worker_ids,
                    )
                    .await?;
                Ok((worker.worker_id, worker.dp_rank))

--- a/lib/llm/src/kv_router/push_router.rs
+++ b/lib/llm/src/kv_router/push_router.rs
@@ -217,6 +217,7 @@ impl KvPushRouter {
                    !is_query_only,
                    lora_name,
                    priority_jump,
+                    None,
                )
                .await?;

--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
@@ -63,6 +63,8 @@ pub struct SchedulingRequest {
    pub lora_name: Option<String>,
    /// Priority jump in seconds; decreases effective arrival time in the queue.
    pub priority_jump: f64,
+    /// Optional set of allowed worker IDs to restrict routing decisions (EPP).
+    pub allowed_worker_ids: Option<HashSet<WorkerId>>,
    resp_tx: Option<tokio::sync::oneshot::Sender<Result<SchedulingResponse, KvSchedulerError>>>,
 }
@@ -204,7 +206,8 @@ impl KvScheduler {
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
-    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
+        allowed_worker_ids: Option<HashSet<WorkerId>>,
+    ) -> Result<SchedulingResponse, KvSchedulerError> {
        #[cfg(feature = "bench")]
        let start = Instant::now();
@@ -220,6 +223,7 @@ impl KvScheduler {
            update_states,
            lora_name,
            priority_jump,
+            allowed_worker_ids,
            resp_tx: Some(resp_tx),
        };
@@ -245,7 +249,7 @@ impl KvScheduler {
            "scheduler.schedule completed"
        );
-        Ok(response.best_worker)
+        Ok(response)
    }
    pub async fn add_request(&self, req: SequenceRequest) -> Result<(), SequenceError> {
@@ -404,7 +408,11 @@ impl WorkerSelector for DefaultWorkerSelector {
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);
-        if workers.is_empty() {
+        let allowed_ids = request.allowed_worker_ids.as_ref();
+        if allowed_ids.map_or(workers.is_empty(), |ids| {
+            !workers.keys().any(|wid| ids.contains(wid))
+        }) {
            return Err(KvSchedulerError::NoEndpoints);
        }
@@ -424,10 +432,10 @@ impl WorkerSelector for DefaultWorkerSelector {
            .and_then(|cfg| cfg.overlap_score_weight)
            .unwrap_or(self.kv_router_config.overlap_score_weight);
-        // Calculate logits for each worker with dp_rank
+        for (worker_id, config) in workers
-        // Outer loop: iterate over all workers from runtime config
+            .iter()
-        // Inner loop: iterate over all dp_ranks for each worker
+            .filter(|(wid, _)| allowed_ids.is_none_or(|ids| ids.contains(wid)))
-        for (worker_id, config) in workers.iter() {
+        {
            let data_parallel_size = config.data_parallel_size;
            for dp_rank in 0..data_parallel_size {

--- a/recipes/llama-3-70b/vllm/agg/gaie/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/deploy.yaml
@@ -18,46 +18,38 @@ spec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/frontend:my-tag
      eppConfig:
-        # This configuration uses Dynamo's KV-aware scorer for intelligent routing
+        # This config uses the same disagg-profile-handler as disaggregated deployments.
+        # The handler's graceful degradation feature makes this possible:
+        # - With no "prefill" profile defined, it runs only the "decode" profile.
+        # - The decode scorer receives isDisaggregated=false, so the Dynamo KV router
+        #   uses full overlap scoring (overlap_score_weight=1.0) for aggregated mode.
+        # - If prefill workers were added later (and a prefill profile configured),
+        #   the same handler would automatically switch to disaggregated routing.
        config:
-          # Plugins define the behavior of EPP
          plugins:
-            # Required: tells EPP which profile to use (even if you only have one)
+            - type: disagg-profile-handler
-            - type: single-profile-handler
+            - name: decode-filter
-            # Picker: chooses the final endpoint after scoring
+              type: label-filter
+              # allowsNoLabel: true lets pods without the subComponentType label pass through,
+              # which is typical for aggregated deployments where workers don't have this label.
+              parameters:
+                label: "nvidia.com/dynamo-sub-component-type"
+                validValues:
+                  - "decode"
+                allowsNoLabel: true
            - name: picker
              type: max-score-picker
-            - name: dyn-kv
+            - name: dyn-decode
-              type: kv-aware-scorer
+              type: dyn-decode-scorer
-          # Scheduling profiles configure which plugins are used and their weights
+          # Only a "decode" profile — no "prefill" profile means pure aggregated mode.
          schedulingProfiles:
-            - name: default
+            - name: decode
              plugins:
-                - pluginRef: dyn-kv
+                - pluginRef: decode-filter
+                - pluginRef: dyn-decode
                  weight: 1
                - pluginRef: picker
-    Frontend:
+    VllmDecodeWorker:
-      envFromSecret: hf-token-secret
-      componentType: frontend
-      volumeMounts:
-        - name: model-cache
-          mountPoint: /opt/models
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
-          workingDir: /workspace/examples/backends/vllm
-          command:
-            - python3
-          args:
-            - -m
-            - dynamo.frontend
-            - --router-mode
-            - direct
-      envs:
-        - name: HF_HOME
-          value: /opt/models
-      replicas: 1
-    VllmPrefillWorker:
      componentType: worker
      envFromSecret: hf-token-secret
      volumeMounts:
@@ -83,6 +75,64 @@ spec:
          - -c
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/examples/backends/vllm
+        # Frontend sidecar: receives requests from kGateway on port 8000
+        # and routes them to the vLLM worker in the same pod
+        containers:
+          - name: frontend
+            image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+            command:
+              - python3
+            args:
+              - -m
+              - dynamo.frontend
+              - --router-mode
+              - direct
+            ports:
+              - containerPort: 8000
+                name: http
+                protocol: TCP
+            envFrom:
+              - secretRef:
+                  name: hf-token-secret
+            env:
+              - name: DYNAMO_PORT
+                value: "8000"
+              - name: DYN_HTTP_PORT
+                value: "8000"
+              - name: DYN_NAMESPACE
+                value: my-model-vllm-agg
+              - name: DYN_COMPONENT
+                value: frontend
+              - name: DYN_DISCOVERY_BACKEND
+                value: kubernetes
+              - name: DYN_PARENT_DGD_K8S_NAME
+                value: llama3-70b-agg
+              - name: DYN_PARENT_DGD_K8S_NAMESPACE
+                value: my-model
+              - name: POD_NAME
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.name
+              - name: POD_NAMESPACE
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.namespace
+              - name: POD_UID
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.uid
+            livenessProbe:
+              httpGet:
+                path: /live
+                port: http
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            readinessProbe:
+              httpGet:
+                path: /health
+                port: http
+              initialDelaySeconds: 10
+              periodSeconds: 10
      replicas: 1
      resources:
        limits:

--- a/recipes/llama-3-70b/vllm/disagg-single-node/gaie/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/gaie/deploy.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: llama3-70b-disagg
+spec:
+  backendFramework: vllm
+  pvcs:
+    - name: model-cache
+      create: false
+  services:
+    Epp:
+      envFromSecret: hf-token-secret
+      componentType: epp
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/epp-image:my-tag
+      eppConfig:
+        config:
+          plugins:
+            - type: disagg-profile-handler
+            - name: prefill-filter
+              type: label-filter
+              parameters:
+                label: "nvidia.com/dynamo-sub-component-type"
+                validValues:
+                  - "prefill"
+                allowsNoLabel: false
+            - name: decode-filter
+              type: label-filter
+              parameters:
+                label: "nvidia.com/dynamo-sub-component-type"
+                validValues:
+                  - "decode"
+                allowsNoLabel: false
+            - name: picker
+              type: max-score-picker
+            - name: dyn-prefill
+              type: dyn-prefill-scorer
+            - name: dyn-decode
+              type: dyn-decode-scorer
+          schedulingProfiles:
+          - name: prefill
+            plugins:
+            - pluginRef: prefill-filter
+            - pluginRef: dyn-prefill
+              weight: 1
+            - pluginRef: picker
+          - name: decode
+            plugins:
+            - pluginRef: decode-filter
+            - pluginRef: dyn-decode
+              weight: 1
+            - pluginRef: picker
+    VllmPrefillWorker:
+      componentType: worker
+      subComponentType: prefill
+      envFromSecret: hf-token-secret
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /opt/models
+      sharedMemory:
+        size: 80Gi
+      extraPodSpec:
+        affinity:
+          podAffinity:
+            preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              podAffinityTerm:
+                labelSelector:
+                  matchExpressions:
+                  - key: nvidia.com/dynamo-component-type
+                    operator: In
+                    values:
+                    - worker
+                topologyKey: kubernetes.io/hostname
+        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: HF_HOME
+              value: /opt/models
+          args:
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          command:
+          - /bin/sh
+          - -c
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+        # Frontend sidecar: receives requests from kGateway on port 8000
+        # and routes them to the vLLM worker in the same pod
+        containers:
+          - name: frontend
+            image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+            command:
+              - python3
+            args:
+              - -m
+              - dynamo.frontend
+              - --router-mode
+              - direct
+            ports:
+              - containerPort: 8000
+                name: http
+                protocol: TCP
+            envFrom:
+              - secretRef:
+                  name: hf-token-secret
+            env:
+              - name: DYNAMO_PORT
+                value: "8000"
+              - name: DYN_HTTP_PORT
+                value: "8000"
+              - name: DYN_NAMESPACE
+                value: a-epp-vllm-disagg
+              - name: DYN_COMPONENT
+                value: frontend
+              - name: DYN_DISCOVERY_BACKEND
+                value: kubernetes
+              - name: DYN_PARENT_DGD_K8S_NAME
+                value: llama3-70b-disagg
+              - name: DYN_PARENT_DGD_K8S_NAMESPACE
+                value: a-epp
+              - name: POD_NAME
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.name
+              - name: POD_NAMESPACE
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.namespace
+              - name: POD_UID
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.uid
+            livenessProbe:
+              httpGet:
+                path: /live
+                port: http
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            readinessProbe:
+              httpGet:
+                path: /health
+                port: http
+              initialDelaySeconds: 10
+              periodSeconds: 10
+      replicas: 2
+      resources:
+        limits:
+          gpu: "2"
+        requests:
+          gpu: "2"
+    VllmDecodeWorker:
+      componentType: worker
+      subComponentType: decode
+      envFromSecret: hf-token-secret
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /opt/models
+      sharedMemory:
+        size: 80Gi
+      extraPodSpec:
+        affinity:
+          podAffinity:
+            preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              podAffinityTerm:
+                labelSelector:
+                  matchExpressions:
+                  - key: nvidia.com/dynamo-component-type
+                    operator: In
+                    values:
+                    - worker
+                topologyKey: kubernetes.io/hostname
+        mainContainer:
+          env:
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: HF_HOME
+              value: /opt/models
+          args:
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          command:
+          - /bin/sh
+          - -c
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+        # Frontend sidecar: receives requests from kGateway on port 8000
+        # and routes them to the vLLM worker in the same pod
+        containers:
+          - name: frontend
+            image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+            command:
+              - python3
+            args:
+              - -m
+              - dynamo.frontend
+              - --router-mode
+              - direct
+            ports:
+              - containerPort: 8000
+                name: http
+                protocol: TCP
+            envFrom:
+              - secretRef:
+                  name: hf-token-secret
+            env:
+              - name: DYNAMO_PORT
+                value: "8000"
+              - name: DYN_HTTP_PORT
+                value: "8000"
+              - name: DYN_NAMESPACE
+                value: a-epp-vllm-disagg
+              - name: DYN_COMPONENT
+                value: frontend
+              - name: DYN_DISCOVERY_BACKEND
+                value: kubernetes
+              - name: DYN_PARENT_DGD_K8S_NAME
+                value: llama3-70b-disagg
+              - name: DYN_PARENT_DGD_K8S_NAMESPACE
+                value: a-epp
+              - name: POD_NAME
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.name
+              - name: POD_NAMESPACE
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.namespace
+              - name: POD_UID
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.uid
+            livenessProbe:
+              httpGet:
+                path: /live
+                port: http
+              initialDelaySeconds: 15
+              periodSeconds: 10
+            readinessProbe:
+              httpGet:
+                path: /health
+                port: http
+              initialDelaySeconds: 10
+              periodSeconds: 10
+      replicas: 1
+      resources:
+        limits:
+          gpu: "4"
+        requests:
+          gpu: "4"
\ No newline at end of file
--- a/recipes/llama-3-70b/vllm/disagg-single-node/gaie/http-route.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/gaie/http-route.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# NOTE: This manifest sets no metadata.namespace; pass the target namespace with kubectl apply -n.
+# If you add a backendRefs.namespace field, it should match where your InferencePool is deployed.
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llama3-70b-disagg-route
+spec:
+  hostnames:
+    - llama3-70b-disagg.example.com
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: inference-gateway
+      namespace: kgateway-system
+  rules:
+    - backendRefs:
+        - group: inference.networking.k8s.io
+          kind: InferencePool
+          name: llama3-70b-disagg-pool
+          port: 8000
+          weight: 1
+      matches:
+        - path:
+            type: PathPrefix
+            value: /
+      timeouts:
+        request: 300s