feat: Add EPP startup probe and adjust recipe - fixes [DEP-749] (#5770)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com> Signed-off-by: Anna Tchernych <atchernych@nvidia.com> Co-authored-by: Julien Mancuso <jmancuso@nvidia.com>

feat: Add EPP startup probe and adjust recipe - fixes [DEP-749] (#5770)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com> Signed-off-by: Anna Tchernych <atchernych@nvidia.com> Co-authored-by: Julien Mancuso <jmancuso@nvidia.com>
2d602853 · atchernych · GitHub · 07d57894 · 2d602853 · 07d57894
Unverified Commit 2d602853 authored Feb 03, 2026 by atchernych Committed by GitHub Feb 03, 2026
15 changed files
--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
@@ -146,6 +146,18 @@ kubectl apply -f operator-managed/examples/agg.yaml -n ${NAMESPACE}
 kubectl apply -f operator-managed/examples/http-route.yaml -n ${NAMESPACE}
 ```

+**Startup Probe Timeout:** The EPP has a default startup probe timeout of 30 minutes (10s × 180 failures).
+If your model takes longer to load, increase the `failureThreshold` in the EPP's `startupProbe`. For example,
+to allow 60 minutes for startup:
+
+```yaml
+extraPodSpec:
+  mainContainer:
+    startupProbe:
+      failureThreshold: 360  # 10s × 360 = 60 minutes
+```
+
+**Gateway Namespace**
 Note that this assumes your gateway is installed into `NAMESPACE=my-model` (examples' default)
 If you installed it into a different namespace, you need to adjust the HttpRoute entry in http-route.yaml.


--- a/deploy/inference-gateway/operator-managed/examples/http-route.yaml
+++ b/deploy/inference-gateway/operator-managed/examples/http-route.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: gateway.networking.k8s.io/v1
-kind: HTTPRoute
-metadata:
-  name: vllm-agg-route
-spec:
-  parentRefs:
-    - group: gateway.networking.k8s.io
-      kind: Gateway
-      name: inference-gateway
-      # Note: This assumes your gateway is installed into the same namespace as this HTTPRoute.
-      # If you installed it into a different namespace, add: namespace: <your-gateway-namespace>
-  rules:
-    - backendRefs:
-        - group: inference.networking.k8s.io
-          kind: InferencePool
-          name: vllm-agg-pool
-          port: 8000
-          weight: 1
-      matches:
-        - path:
-            type: PathPrefix
-            value: /
-      timeouts:
-        request: 300s
\ No newline at end of file
--- a/deploy/operator/internal/dynamo/component_epp.go
+++ b/deploy/operator/internal/dynamo/component_epp.go
@@ -71,17 +71,26 @@ func (e *EPPDefaults) GetBaseContainer(context ComponentContext) (corev1.Contain
 		PeriodSeconds:       10,
 	}

+	// Startup probe allows long initialization while waiting for workers to register.
+	// EPP waits indefinitely for discovery to find workers, so this probe is the
+	// only timeout mechanism. Default: 30 minutes (10s × 180 = 1800s).
+	container.StartupProbe = &corev1.Probe{
+		ProbeHandler: corev1.ProbeHandler{
+			GRPC: &corev1.GRPCAction{
+				Port:    9003,
+				Service: ptr.To("inference-extension"),
+			},
+		},
+		PeriodSeconds:    10,
+		FailureThreshold: 180,
+	}
+
 	// EPP-specific environment variables
 	container.Env = append(container.Env, []corev1.EnvVar{
 		{
 			Name:  "DYN_KV_BLOCK_SIZE",
 			Value: "16",
 		},
-		{
-			// DYN_DISCOVERY_TIMEOUT_SEC is how long to wait for workers to register (in seconds)
-			Name:  "DYN_DISCOVERY_TIMEOUT_SEC",
-			Value: "300",
-		},
 		{
 			Name:  "USE_STREAMING",
 			Value: "true",

--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -57,47 +57,30 @@ pub enum DynamoLlmResult {
    ERR = 1,
 }

-/// Default timeout for discovery sync (seconds).
-const DEFAULT_DISCOVERY_TIMEOUT_SEC: u64 = 10;
-
-/// Get discovery timeout from environment variable or use default.
-/// Reads DYN_DISCOVERY_TIMEOUT_SEC env var (in seconds).
-fn get_discovery_timeout_secs() -> u64 {
-    std::env::var("DYN_DISCOVERY_TIMEOUT_SEC")
-        .ok()
-        .and_then(|s| s.parse::<u64>().ok())
-        .unwrap_or(DEFAULT_DISCOVERY_TIMEOUT_SEC)
-}
-
-/// Wait for the discovery daemon to sync and return at least one instance.
-/// This ensures list() calls will have data available.
-/// Returns the number of instances found, or 0 if timed out.
-async fn wait_for_discovery_sync(drt: &DistributedRuntime, timeout_secs: u64) -> usize {
-    tracing::info!("Waiting for discovery to sync...");
+// Wait indefinitely for the discovery daemon to sync and return at least one instance.
+// Model info is registered by workers, and it may take up to 30 minutes for the model weights to load and for a worker to register itself.
+// The timeout is enforced by the Kubernetes StartupProbe: the EPP wait loop runs indefinitely, and the probe is the single source of truth for when to kill the EPP if discovery fails.
+// If workers are not found within the probe's failureThreshold × periodSeconds, the pod will be killed and restarted.
+// Users can adjust the StartupProbe timeout in the DGD for large models.
+async fn wait_for_discovery_sync(drt: &DistributedRuntime) -> usize {
+    tracing::info!(
+        "Waiting for discovery to sync (no timeout - controlled by K8s StartupProbe)..."
+    );
    let discovery = drt.discovery();
-    let timeout = std::time::Duration::from_secs(timeout_secs);
-    let start = std::time::Instant::now();

    loop {
        match discovery.list(DiscoveryQuery::AllModels).await {
            Ok(instances) if !instances.is_empty() => {
-                tracing::info!(
-                    "Discovery sync complete: found {} instances",
-                    instances.len()
-                );
                return instances.len();
            }
            Ok(_) => {
-                if start.elapsed() > timeout {
-                    tracing::warn!("Discovery sync timed out waiting for instances");
-                    return 0;
-                }
                tracing::debug!("No instances yet, waiting...");
                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
            }
            Err(e) => {
-                tracing::warn!("Discovery list error: {}, continuing...", e);
-                return 0;
+                // Log and continue - transient errors shouldn't stop the wait
+                tracing::warn!("Discovery list error: {}, retrying...", e);
+                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
            }
        }
    }
@@ -128,18 +111,12 @@ pub unsafe extern "C" fn dynamo_llm_init(
            .await
        {
            Ok(drt) => {
-                // Wait for discovery to sync before returning
+                // Wait for discovery to sync before returning.
                // This is needed because dynamo_create_worker_selection_pipeline() is called
-                // immediately after, and it needs discovery.list() to return data
-                // the discovery daemon takes time to query K8s and returns async, so we need to wait.
-                let timeout_secs = get_discovery_timeout_secs();
-                let instance_count = wait_for_discovery_sync(drt, timeout_secs).await;
-                if instance_count == 0 {
-                    tracing::error!(
-                        "Discovery sync failed: no worker instances found. Is the backend running?"
-                    );
-                    return Err(DynamoLlmResult::ERR);
-                }
+                // immediately after, and it needs discovery.list() to return data.
+                // The discovery daemon takes time to query K8s and returns results asynchronously, so we need to wait.
+                // Note: This waits indefinitely - the K8s StartupProbe is the timeout mechanism.
+                wait_for_discovery_sync(drt).await;
                Ok(())
            }
            Err(e) => {
@@ -1374,14 +1351,9 @@ pub async fn create_worker_selection_pipeline_chat(

    // Only wait for discovery sync if we just initialized the DRT
    // (dynamo_llm_init already does this when it initializes)
+    // Note: This waits indefinitely - the K8s StartupProbe is the timeout mechanism.
    if needs_sync {
-        let timeout_secs = get_discovery_timeout_secs();
-        let instance_count = wait_for_discovery_sync(distributed_runtime, timeout_secs).await;
-        if instance_count == 0 {
-            return Err(anyhow::anyhow!(
-                "Discovery sync failed: no worker instances found. Is the backend running?"
-            ));
-        }
+        wait_for_discovery_sync(distributed_runtime).await;
    }

    let component = distributed_runtime

--- a/recipes/llama-3-70b/README.md
+++ b/recipes/llama-3-70b/README.md
@@ -64,4 +64,4 @@ curl http://localhost:8000/v1/chat/completions \

 - Update `storageClassName` in `model-cache/model-cache.yaml` to match your cluster before deploying
 - Model download takes approximately 15-30 minutes depending on network speed
- For GAIE (Gateway API Inference Extension) integration, see [vllm/agg/gaie/](vllm/agg/gaie/)
+- For GAIE (Gateway API Inference Extension) integration, `kubectl apply` the files from the corresponding subfolder, i.e., [vllm/agg/gaie/](vllm/agg/gaie/)
--- a/deploy/inference-gateway/operator-managed/examples/agg.yaml
+++ b/deploy/inference-gateway/operator-managed/examples/agg.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: vllm-agg
+  name: llama3-70b-agg
 spec:
+  backendFramework: vllm
+  pvcs:
+    - name: model-cache
+      create: false
  services:
    Epp:
      envFromSecret: hf-token-secret
@@ -13,14 +16,12 @@ spec:
      replicas: 1
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/frontend-image:my-tag
+          image: nvcr.io/nvidia/ai-dynamo/frontend:0.8.0
          env:
            - name: DYN_KV_BLOCK_SIZE
-              value: "16"
+              value: "128"
            - name: DYN_MODEL
-              value: "Qwen/Qwen3-0.6B"  # Match your model
-            - name: DYN_DISCOVERY_TIMEOUT
-              value: "300"
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
      eppConfig:
        # This configuration uses Dynamo's KV-aware scorer for intelligent routing
        config:
@@ -43,35 +44,46 @@ spec:
    Frontend:
      envFromSecret: hf-token-secret
      componentType: frontend
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /opt/models
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
+          workingDir: /workspace/examples/backends/vllm
+      envs:
+        - name: HF_HOME
+          value: /opt/models
      replicas: 1
+    VllmPrefillWorker:
+      componentType: worker
+      envFromSecret: hf-token-secret
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /opt/models
+      sharedMemory:
+        size: 20Gi
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          env:
-            - name: DYN_LOG
-              value: "debug,dynamo_llm::kv_router=trace"
+            - name: SERVED_MODEL_NAME
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: MODEL_PATH
+              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
+            - name: HF_HOME
+              value: /opt/models
            - name: DYN_STORE_KV
              value: "mem"
-            - name: DYN_ROUTER_MODE
-              value: "kv"
-    VllmDecodeWorker:
-      envFromSecret: hf-token-secret
-      componentType: worker
+          args:
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          command:
+          - /bin/sh
+          - -c
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
+          workingDir: /workspace/examples/backends/vllm
      replicas: 1
      resources:
        limits:
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
-          workingDir: /workspace/examples/backends/vllm
-          command:
-            - python3
-            - -m
-            - dynamo.vllm
-          args:
-            - --model
-            - Qwen/Qwen3-0.6B
-          env:
-            - name: DYN_STORE_KV
-              value: "mem"
+          gpu: "4"
+        requests:
+          gpu: "4"
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/http-route.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/http-route.yaml
@@ -27,7 +27,7 @@ spec:
      namespace: kgateway-system
  rules:
    - backendRefs:
-        - group: inference.networking.x-k8s.io
+        - group: inference.networking.k8s.io
          kind: InferencePool
          name: llama3-70b-agg-pool
          port: 8000

--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/configmap.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/configmap.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: You can remove the namespace field if using kubectl apply -n
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: epp-config
-  labels:
-    app.kubernetes.io/name: dynamo-gaie
-data:
-  epp-config-dynamo.yaml: |
-    apiVersion: inference.networking.x-k8s.io/v1alpha1
-    kind: EndpointPickerConfig
-    plugins:
-      # Required: tells EPP which profile to use (even if you only have one)
-      - type: single-profile-handler
-
-      # Picker: chooses the final endpoint after scoring
-      - name: picker
-        type: max-score-picker
-      - name: dyn-pre
-        type: dynamo-inject-workerid
-        parameters: {}
-      - name: dyn-kv
-        type: kv-aware-scorer
-      - name: dyn-cleanup
-        type: dynamo-cleanup
-    schedulingProfiles:
-      - name: default
-        plugins:
-          - pluginRef: dyn-kv
-            weight: 1
-          - pluginRef: picker
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/deployment.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/deployment.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: Update the namespace field below to match your deployment namespace
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: llama3-70b-agg-epp
-  labels:
-    app: llama3-70b-agg-epp
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: llama3-70b-agg-epp
-  template:
-    metadata:
-      labels:
-        app: llama3-70b-agg-epp
-    spec:
-      serviceAccountName: epp-sa
-      terminationGracePeriodSeconds: 130
-
-      imagePullSecrets:
-        - name: docker-imagepullsecret
-
-      containers:
-        - name: epp
-          image: nvcr.io/nvidia/ai-dynamo/frontend:0.8.0
-          imagePullPolicy: IfNotPresent
-          resources:
-            requests:
-              memory: "1Gi"
-              cpu: "1"
-            limits:
-              memory: "2Gi"
-              cpu: "2"
-          command: ["/bin/sh", "-c"]
-          args:
-            - >
-              exec /epp
-              -poolName "llama3-70b-agg-pool"
-              -poolNamespace "$POD_NAMESPACE"
-              -v 4 --zap-encoder json
-              -grpcPort 9002 -grpcHealthPort 9003
-              -configFile /etc/epp/epp-config-dynamo.yaml
-
-          volumeMounts:
-            - name: epp-config
-              mountPath: /etc/epp
-              readOnly: true
-
-          env:
-            - name: POD_NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-            - name: POD_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_UID
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.uid
-            - name: PLATFORM_NAMESPACE
-              value: "$(POD_NAMESPACE)" # set to your dynamo platform namespace if different
-            # if you want to use etcd enable this and remove the DYN_DISCOVERY_BACKEND env var
-            # - name: ETCD_ENDPOINTS
-            #   value: "dynamo-platform-etcd.$(PLATFORM_NAMESPACE):2379" #  update dynamo-platform to appropriate namespace
-            - name: NATS_SERVER
-              value: "nats://dynamo-platform-nats.$(PLATFORM_NAMESPACE):4222" #  update dynamo-platform to appropriate namespace
-            - name: DYNAMO_NAMESPACE
-              value: "$(POD_NAMESPACE)-llama3-70b-agg"
-            - name: DYN_MODEL
-              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
-            - name: DYN_KV_BLOCK_SIZE
-              value: "128" # UPDATE to match the --block-size in your deploy.yaml engine command
-            - name: USE_STREAMING
-              value: "true"
-            - name: DYN_ENFORCE_DISAGG
-              value: "false"
-            - name: DYN_DISCOVERY_BACKEND
-              value: "kubernetes"
-
-          ports:
-            - containerPort: 9002
-            - containerPort: 9003
-            - name: metrics
-              containerPort: 9090
-          livenessProbe:
-            grpc:
-              port: 9003
-              service: inference-extension
-            initialDelaySeconds: 5
-            periodSeconds: 10
-          readinessProbe:
-            grpc:
-              port: 9003
-              service: inference-extension
-            initialDelaySeconds: 5
-            periodSeconds: 10
-
-      volumes:
-        - name: epp-config
-          configMap:
-            name: epp-config
-            items:
-              - key: epp-config-dynamo.yaml
-                path: epp-config-dynamo.yaml
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/service.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/service.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: Update the namespace field below to match your deployment namespace
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama3-70b-agg-epp
-spec:
-  selector:
-    app: llama3-70b-agg-epp
-  ports:
-    - protocol: TCP
-      port: 9002
-      targetPort: 9002
-      appProtocol: http2
-  type: ClusterIP
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/model/inference-model.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/model/inference-model.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: You can remove the namespace field if using kubectl apply -n
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  name: llama3-70b-agg-model
-spec:
-  criticality: Critical
-  modelName: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
-  poolRef:
-    group: inference.networking.x-k8s.io
-    kind: InferencePool
-    name: llama3-70b-agg-pool
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/model/inference-pool.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/model/inference-pool.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: You can remove the namespace field if using kubectl apply -n
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferencePool
-metadata:
-  name: llama3-70b-agg-pool
-spec:
-  targetPortNumber: 8000
-  selector:
-    nvidia.com/dynamo-component-type: frontend
-  extensionRef:
-    failureMode: FailOpen
-    group: ""
-    kind: Service
-    name: llama3-70b-agg-epp
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/cluster-role.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/cluster-role.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: pod-read
-rules:
-# Gateway API inference resources
- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
-  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
-  verbs: ["get", "watch", "list"]
-# Core resources for pod discovery
- apiGroups: [""]
-  resources: ["pods"]
-  verbs: ["get", "watch", "list"]
-# Dynamo k8s service discovery - endpointslices
- apiGroups: ["discovery.k8s.io"]
-  resources: ["endpointslices"]
-  verbs: ["get", "list", "watch"]
-# Dynamo k8s service discovery - worker metadata CRs
- apiGroups: ["nvidia.com"]
-  resources: ["dynamoworkermetadatas"]
-  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
-# Authentication/authorization
- apiGroups:
-  - authentication.k8s.io
-  resources:
-  - tokenreviews
-  verbs:
-  - create
- apiGroups:
-  - authorization.k8s.io
-  resources:
-  - subjectaccessreviews
-  verbs:
-  - create
-
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/role-binding.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/role-binding.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: ClusterRoleBinding is cluster-scoped (no metadata.namespace)
-# The subjects.namespace field specifies where the ServiceAccount is located
-# This CANNOT be removed - it must match your deployment namespace
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
-  name: pod-read-binding
-  # no metadata.namespace - kubectl -n sets it
-subjects:
-  - kind: ServiceAccount
-    name: epp-sa
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: pod-read
--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/service-account.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/service-account.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: epp-sa
-# no metadata.namespace (kubectl -n sets it)