fix: Dyn-1729 enable etcd-less GAIE deployment in the helm chart (#5432)

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>

fix: Dyn-1729 enable etcd-less GAIE deployment in the helm chart (#5432)
Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
cf57c766 · atchernych · GitHub · 0d597e7c · cf57c766 · cf57c766
Unverified Commit cf57c766 authored Jan 14, 2026 by atchernych Committed by GitHub Jan 14, 2026
5 changed files
--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
@@ -8,7 +8,7 @@ When integrating Dynamo with the Inference Gateway you could either use the defa

 The setup provided here uses the Dynamo custom EPP by default. Set `epp.useDynamo=false` in your deployment to pick approach 2.

-EPP’s default kv-routing approach is token-aware only `by approximation` because the prompt is tokenized with a generic tokenizer unaware of the model deployed. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
+EPP’s default kv-routing approach is not token-aware because the prompt is hashed without tokenization. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).

 Currently, these setups are only supported with the kGateway based Inference Gateway.

@@ -32,29 +32,12 @@ Currently, these setups are only supported with the kGateway based Inference Gat
 ### 2. Deploy Inference Gateway ###

 First, deploy an inference gateway service. In this example, we'll install the `kgateway`-based gateway implementation.
-You can use the script below or follow the steps manually.
-
-Script:

 ```bash
 ./install_gaie_crd_kgateway.sh
 ```

-Manual steps:
-
-a. Deploy the Gateway API CRDs:
-
-```bash
-GATEWAY_API_VERSION=v1.3.0
-kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml
-```
-
-b. Install the Inference Extension CRDs (Inference Model and Inference Pool CRDs)
-
-```bash
-INFERENCE_EXTENSION_VERSION=v0.5.1
-kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml
-```
+Verify installation:

 ```bash
 kubectl get gateway inference-gateway -n my-model
@@ -119,6 +102,12 @@ export EPP_IMAGE=<the-epp-image-you-built>
 helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set-string extension.image=$EPP_IMAGE
 ```

+By default, the Kubernetes discovery mechanism is used. If you prefer etcd, add the `--set epp.dynamo.useEtcd=true` flag to the install command, as shown below.
+
+```bash
+helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set-string extension.image=$EPP_IMAGE --set epp.dynamo.useEtcd=true
+```
+
 Key configurations include:

 - An InferenceModel resource for the Qwen model
@@ -216,10 +205,6 @@ The Inference Gateway provides HTTP endpoints for model inference.

 #### 1: Populate gateway URL for your k8s cluster ####

-```bash
-export GATEWAY_URL=<Gateway-URL>
-```
-
 To test the gateway in minikube, use the following command:
 a. Use minikube tunnel to expose the gateway to the host
   This requires `sudo` access to the host machine. Alternatively, you can use port-forward to expose the gateway to the host as shown in alternative (b).
@@ -230,7 +215,7 @@ ps aux | grep "minikube tunnel" | grep -v grep # make sure minikube tunnel is no
 minikube tunnel & # start the tunnel

 # in second terminal where you want to send inference requests
-GATEWAY_URL=$(kubectl get svc inference-gateway -n my-model -o yaml -o jsonpath='{.spec.clusterIP}')
+GATEWAY_URL=$(kubectl get svc inference-gateway -n my-model -o jsonpath='{.spec.clusterIP}')
 echo $GATEWAY_URL
 ```


--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
@@ -17,15 +17,26 @@ apiVersion: rbac.authorization.k8s.io/v1
 metadata:
  name: pod-read
 rules:
+# Gateway API inference resources
 - apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
 - apiGroups: ["inference.networking.x-k8s.io"]
  resources: ["inferencemodels"]
  verbs: ["get", "watch", "list"]
+# Core resources for pod discovery
 - apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
+# Dynamo k8s service discovery - endpointslices
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "list", "watch"]
+# Dynamo k8s service discovery - worker metadata CRs
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamoworkermetadatas"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+# Authentication/authorization
 - apiGroups:
  - authentication.k8s.io
  resources:
@@ -37,4 +48,4 @@ rules:
  resources:
  - subjectaccessreviews
  verbs:
-  - create
\ No newline at end of file
+  - create
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
@@ -19,6 +19,7 @@
 {{- $resolvedDynNs := (include "dynamo-gaie.dynamoNamespace" .) | trim -}}
 {{- $ns           := ternary (required "set dynamoGraphDeploymentName when epp.useDynamo=true" $resolvedDynNs) "" $useDynamo -}}
 {{- $kv           := default "16" .Values.epp.dynamo.kvBlockSize -}}
+{{- $useEtcd      := default false .Values.epp.dynamo.useEtcd -}}
 {{- $std          := .Values.extension.standardImage -}}
 {{- $dyn          := .Values.extension.dynamoImage -}}
 {{- $fallback     := ternary $dyn $std .Values.epp.useDynamo -}}
@@ -87,8 +88,25 @@ spec:

        env:
        {{- if $useDynamo }}
+          - name: POD_NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          - name: POD_UID
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.uid
+          {{- if $useEtcd }}
          - name: ETCD_ENDPOINTS
            value: "{{ $platformName }}-etcd.{{ $platformNs }}:2379"
+          {{- else }}
+          - name: DYN_DISCOVERY_BACKEND
+            value: "kubernetes"
+          {{- end }}
          - name: NATS_SERVER
            value: "nats://{{ $platformName }}-nats.{{ $platformNs }}:4222"
          - name: DYNAMO_NAMESPACE

--- a/deploy/inference-gateway/helm/dynamo-gaie/values.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/values.yaml
@@ -73,6 +73,9 @@ epp:
  configFile: "/etc/epp/epp-config-dynamo.yaml"
  dynamo:
    kvBlockSize: "16"
+    # Use ETCD for discovery instead of Kubernetes (default: false)
+    # Set to true via --set epp.dynamo.useEtcd=true to enable ETCD discovery
+    useEtcd: false

 # Platform configuration (for Dynamo mode)
 platformReleaseName: dynamo-platform

--- a/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/cluster-role.yaml
+++ b/recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/rbac/cluster-role.yaml
@@ -28,10 +28,6 @@ rules:
 - apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "watch", "list"]
-# Dynamo k8s service discovery - endpoints
-- apiGroups: [""]
-  resources: ["endpoints"]
-  verbs: ["get", "list", "watch"]
 # Dynamo k8s service discovery - endpointslices
 - apiGroups: ["discovery.k8s.io"]
  resources: ["endpointslices"]