feat: add epp component (#5611)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>

feat: add epp component (#5611)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
9e2a2cc9 · Julien Mancuso · GitHub · 6271a31f · 9e2a2cc9 · 9e2a2cc9
Unverified Commit 9e2a2cc9 authored Jan 29, 2026 by Julien Mancuso Committed by GitHub Jan 29, 2026
20 changed files
--- a/deploy/helm/charts/crds/templates/nvidia.com_dynamocomponentdeployments.yaml
+++ b/deploy/helm/charts/crds/templates/nvidia.com_dynamocomponentdeployments.yaml
--- a/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
@@ -210,7 +210,7 @@ spec:
                            Claims lists the names of resources, defined in spec.resourceClaims,
                            that are used by this container.
-                            This is an alpha field and requires enabling the
+                            This field depends on the
                            DynamicResourceAllocation feature gate.
                            This field is immutable. It can only be set for containers.

--- a/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeployments.yaml
+++ b/deploy/helm/charts/crds/templates/nvidia.com_dynamographdeployments.yaml
--- a/deploy/helm/charts/platform/README.md
+++ b/deploy/helm/charts/platform/README.md
@@ -19,7 +19,7 @@ limitations under the License.
 A Helm chart for NVIDIA Dynamo Platform.
-![Version: 0.7.1](https://img.shields.io/badge/Version-0.7.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
+![Version: 0.8.0](https://img.shields.io/badge/Version-0.8.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
 ## 🚀 Overview
@@ -86,7 +86,7 @@ The chart includes built-in validation to prevent all operator conflicts:
 | Repository | Name | Version |
 |------------|------|---------|
-| file://components/operator | dynamo-operator | 0.6.1 |
+| file://components/operator | dynamo-operator | 0.7.1 |
 | https://charts.bitnami.com/bitnami | etcd | 12.0.18 |
 | https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 |
 | oci://ghcr.io/nvidia/grove | grove(grove-charts) | v0.1.0-alpha.3 |
@@ -99,6 +99,7 @@ The chart includes built-in validation to prevent all operator conflicts:
 | dynamo-operator.enabled | bool | `true` | Whether to enable the Dynamo Kubernetes operator deployment |
 | dynamo-operator.natsAddr | string | `""` | NATS server address for operator communication (leave empty to use the bundled NATS chart). Format: "nats://hostname:port" |
 | dynamo-operator.etcdAddr | string | `""` | etcd server address for operator state storage (leave empty to use the bundled etcd chart). Format: "http://hostname:port" or "https://hostname:port" |
+| dynamo-operator.nats.enabled | bool | `true` | Whether NATS is enabled |
 | dynamo-operator.modelExpressURL | string | `""` | URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true). |
 | dynamo-operator.namespaceRestriction | object | `{"enabled":false,"lease":{"duration":"30s","renewInterval":"10s"},"targetNamespace":null}` | Namespace access controls for the operator |
 | dynamo-operator.namespaceRestriction.enabled | bool | `false` | Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace). |
@@ -176,7 +177,7 @@ For detailed etcd configuration options beyond `etcd.enabled`, please refer to t
 ## 📚 Additional Resources
- [Dynamo Kubernetes Platform Deployment Installation Guide](../../../../docs/kubernetes/installation_guide.md)
+- [Dynamo Cloud Deployment Installation Guide](../../../../docs/kubernetes/installation_guide.md)
 - [NATS Documentation](https://docs.nats.io/)
 - [etcd Documentation](https://etcd.io/docs/)
 - [Kubernetes Operator Pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/)

--- a/deploy/helm/charts/platform/components/operator/templates/deployment.yaml
+++ b/deploy/helm/charts/platform/components/operator/templates/deployment.yaml
@@ -131,6 +131,7 @@ spec:
        {{- if not .Values.namespaceRestriction.enabled }}
          - --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
          - --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
+          - --epp-cluster-role-name={{ include "dynamo-operator.fullname" . }}-epp
        {{- end }}
        {{- if .Values.discoveryBackend }}
          - --discovery-backend={{ .Values.discoveryBackend }}

--- a/deploy/helm/charts/platform/components/operator/templates/epp.yaml
+++ b/deploy/helm/charts/platform/components/operator/templates/epp.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+{{- if .Values.namespaceRestriction.enabled }}
+# Namespace-restricted mode: Role + ServiceAccount + RoleBinding
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: epp-serviceaccount
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+{{- if .Values.dynamo.dockerRegistry.useKubernetesSecret }}
+imagePullSecrets:
+- name: {{ include "dynamo-operator.componentsDockerRegistrySecretName" . }}
+{{- end }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: epp-role
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+rules:
+# Gateway API inference resources
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencepools", "inferenceobjectives", "inferencemodelrewrites"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.k8s.io"]
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+# Core resources for pod discovery
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+# Dynamo k8s service discovery - endpointslices
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "list", "watch"]
+# Dynamo k8s service discovery - worker metadata CRs
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamoworkermetadatas"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: epp-binding
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+subjects:
+- kind: ServiceAccount
+  name: epp-serviceaccount
+  namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: Role
+  name: epp-role
+  apiGroup: rbac.authorization.k8s.io
+{{- else }}
+# Cluster-wide mode: ClusterRole for EPP
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-epp
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+rules:
+# Gateway API inference resources
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["inferencepools", "inferenceobjectives", "inferencemodelrewrites"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.k8s.io"]
+  resources: ["inferencepools"]
+  verbs: ["get", "watch", "list"]
+# Core resources for pod discovery
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+# Dynamo k8s service discovery - endpointslices
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "list", "watch"]
+# Dynamo k8s service discovery - worker metadata CRs
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamoworkermetadatas"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+# Authentication/authorization
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
+- apiGroups:
+  - authorization.k8s.io
+  resources:
+  - subjectaccessreviews
+  verbs:
+  - create
+{{- end }}
--- a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
+++ b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
@@ -363,6 +363,18 @@ rules:
  - patch
  - update
  - watch
+- apiGroups:
+  - inference.networking.k8s.io
+  resources:
+  - inferencepools
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
  - nvidia.com
  resources:

--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
@@ -22,8 +22,8 @@ Currently, these setups are only supported with the kGateway based Inference Gat
  - [1. Install Dynamo Platform](#1-install-dynamo-platform)
  - [2. Deploy Inference Gateway](#2-deploy-inference-gateway)
  - [3. Deploy Your Model](#3-deploy-your-model)
-  - [4. Build EPP image](#4-build-epp-image)
+  - [4. Build EPP image (Optional)](#4-build-epp-image-optional)
-  - [5. Install Dynamo GAIE helm chart](#5-install-dynamo-gaie-helm-chart)
+  - [5. Deploy](#5-deploy)
  - [6. Verify Installation](#6-verify-installation)
  - [7. Usage](#7-usage)
  - [8. Deleting the installation](#8-deleting-the-installation)
@@ -49,6 +49,7 @@ First, deploy an inference gateway service. In this example, we'll install `kgat
 ```bash
 cd deploy/inference-gateway
+export NAMESPACE=my-model # You can put the inference gateway into another namespace and then adjust your http-route.yaml
 ./scripts/install_gaie_crd_kgateway.sh
 ```
 **Note**: The manifest at `config/manifests/gateway/kgateway/gateway.yaml` uses `gatewayClassName: agentgateway`, but kGateway's helm chart creates a GatewayClass named `kgateway`. The patch command in the script fixes this mismatch.
@@ -64,7 +65,7 @@ kubectl get gateway inference-gateway
 ```
-### 3. Deploy Your Model ###
+### 3. Setup secrets ###
 Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace.
 Make sure to enable kv-routing by adding the env var in the FrontEnd.
@@ -109,7 +110,7 @@ Create a model configuration file similar to the vllm_agg_qwen.yaml for your mod
 This file demonstrates the values needed for the Vllm Agg setup in [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml)
 Take a note of the model's block size provided in the model card.
-### 4. Build EPP image
+### 4. Build EPP image (Optional)
 You can either use the provided Dynamo FrontEnd image for the EPP image or you need to build your own Dynamo EPP custom image following the steps below.
@@ -137,14 +138,44 @@ make info # Check image tag
 | `make all` | Build Dynamo lib + Docker image + load locally |
 | `make all-push` | Build Dynamo lib + Docker image + push to registry |
-### 5. Install Dynamo GAIE helm chart ###
+### 5. Deploy
-The Inference Gateway is configured through the `inference-gateway-resources.yaml` file.
+We recommend deploying the Inference Gateway's Endpoint Picker (EPP) as a component managed by the Dynamo operator. Alternatively,
+you can deploy it as a standalone pod.
-Deploy the Inference Gateway resources to your Kubernetes cluster by running the command below.
+#### 5.a. Deploy as a DGD component
 ```bash
-cd deploy/inference-gateway/
+kubectl apply -f operator-managed/examples/agg.yaml -n ${NAMESPACE}
+kubectl apply -f operator-managed/examples/http-route.yaml -n ${NAMESPACE}
+```
+Note that this assumes your gateway is installed into `NAMESPACE=my-model` (the examples' default).
+If you installed it into a different namespace, you need to adjust the HTTPRoute entry in http-route.yaml.
+#### 5.b. Deploy as a standalone pod
+##### 5.b.1 Deploy Your Model ###
+Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace.
+Sample commands to deploy model:
+```bash
+cd <dynamo-source-root>
+cd examples/backends/vllm/deploy
+kubectl apply -f agg.yaml -n my-model
+```
+Take a note of or change the DYNAMO_IMAGE in the model deployment file.
+Do not forget docker registry secret if needed.
+##### 5.b.2 Install Dynamo GAIE helm chart ###
+```bash
+cd deploy/inference-gateway/standalone
 # Export the Dynamo image you have used when deploying your model in Step 3.
 export DYNAMO_IMAGE=<the-dynamo-image-you-have-used-when-deploying-the-model>
@@ -179,10 +210,10 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
 - Set `DYNAMO_ENFORCE_DISAGG=true` if you want to enforce every request being served in the disaggregated manner. By default it is false meaning if the the prefill worker is not available the request will be served in the aggregated manner.
 - By default the Dynamo plugin uses KV routing. You can expose `DYNAMO_USE_KV_ROUTING=false`  in your [values-dynamo-epp.yaml] if you prefer to route in the round-robin fashion.
 - If using kv-routing:
-  - Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) to match your model's block size.The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
+  - Overwrite the `DYN_KV_BLOCK_SIZE` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) to match your model's block size. The `DYN_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
  - Set `DYNAMO_OVERLAP_SCORE_WEIGHT` to weigh how heavily the score uses token overlap (predicted KV cache hits) versus other factors (load, historical hit rate). Higher weight biases toward reusing workers with similar cached prefixes.
  - Set `DYNAMO_ROUTER_TEMPERATURE` to soften or sharpen the selection curve when combining scores. Low temperature makes the router pick the top candidate deterministically; higher temperature lets lower-scoring workers through more often (exploration).
-  - Set `DYNAMO_USE_KV_EVENTS=false` if you want to disable KV event tracking while using kv-routing
+  - Set `DYNAMO_USE_KV_EVENTS=false` if you want to disable the workers sending KV events while using kv-routing
  - See the [KV cache routing design](../../docs/router/kv_cache_routing.md) for details.
@@ -238,8 +269,7 @@ ps aux | grep "minikube tunnel" | grep -v grep # make sure minikube tunnel is no
 minikube tunnel # start the tunnel
 # in second terminal where you want to send inference requests
-GATEWAY_URL=$(kubectl get svc inference-gateway -n my-model -o jsonpath='{.spec.clusterIP}')
+GATEWAY_URL=$(kubectl get svc inference-gateway -n my-model -o jsonpath='{.spec.clusterIP}') && echo $GATEWAY_URL
-echo $GATEWAY_URL
 ```
 b. use port-forward to expose the gateway to the host

--- a/deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
+++ b/deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
@@ -231,31 +231,31 @@ var (
 )
 func loadDynamoConfig() {
-	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
+	ffiNamespace = getEnvOrDefault("DYN_NAMESPACE", "vllm-agg")
-	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
+	ffiComponent = "backend" // The pipeline uses backend not DYN_COMPONENT which is epp
-	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
+	ffiModel = getEnvOrDefault("DYN_MODEL", "Qwen/Qwen3-0.6B")
 	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
 	ffiEnforceDisagg = getEnvBoolOrDefault("DYNAMO_ENFORCE_DISAGG", false)
 	ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
 	ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)
-	kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE")
+	kvBlockSizeStr := os.Getenv("DYN_KV_BLOCK_SIZE")
 	if kvBlockSizeStr == "" {
-		panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
+		panic("DYN_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
 	}
 	var tmp int64
 	if n, err := fmt.Sscanf(kvBlockSizeStr, "%d", &tmp); err != nil || n != 1 {
-		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
+		panic(fmt.Sprintf("DYN_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
 	}
 	ffiKvBlockSize = uint32(tmp)
 	if ffiKvBlockSize < 16 || ffiKvBlockSize > 8192 {
-		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", ffiKvBlockSize))
+		panic(fmt.Sprintf("DYN_KV_BLOCK_SIZE=%d outside [16,8192]", ffiKvBlockSize))
 	}
 	if (ffiKvBlockSize & (ffiKvBlockSize - 1)) != 0 {
-		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", ffiKvBlockSize))
+		panic(fmt.Sprintf("DYN_KV_BLOCK_SIZE=%d must be a power of 2", ffiKvBlockSize))
 	}
-	fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
+	fmt.Printf("Dynamo KV Scorer: Loaded DYN_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
 }
 func getEnvOrDefault(key, def string) string {

--- a/deploy/inference-gateway/operator-managed/examples/agg.yaml
+++ b/deploy/inference-gateway/operator-managed/examples/agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-agg
+spec:
+  services:
+    Epp:
+      envFromSecret: hf-token-secret
+      componentType: epp
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/frontend-image:my-tag
+          env:
+            - name: DYN_KV_BLOCK_SIZE
+              value: "16"
+            - name: DYN_MODEL
+              value: "Qwen/Qwen3-0.6B"  # Match your model
+            - name: DYN_DISCOVERY_TIMEOUT
+              value: "300"
+      eppConfig:
+        # This configuration uses Dynamo's KV-aware scorer for intelligent routing
+        config:
+          # Plugins define the behavior of EPP
+          plugins:
+            # Required: tells EPP which profile to use (even if you only have one)
+            - type: single-profile-handler
+            # Picker: chooses the final endpoint after scoring
+            - name: picker
+              type: max-score-picker
+            - name: dyn-kv
+              type: kv-aware-scorer
+          # Scheduling profiles configure which plugins are used and their weights
+          schedulingProfiles:
+            - name: default
+              plugins:
+                - pluginRef: dyn-kv
+                  weight: 1
+                - pluginRef: picker
+    Frontend:
+      envFromSecret: hf-token-secret
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          env:
+            - name: DYN_LOG
+              value: "debug,dynamo_llm::kv_router=trace"
+            - name: DYN_STORE_KV
+              value: "mem"
+            - name: DYN_ROUTER_MODE
+              value: "kv"
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+          command:
+            - python3
+            - -m
+            - dynamo.vllm
+          args:
+            - --model
+            - Qwen/Qwen3-0.6B
+          env:
+            - name: DYN_STORE_KV
+              value: "mem"
--- a/deploy/inference-gateway/operator-managed/examples/http-route.yaml
+++ b/deploy/inference-gateway/operator-managed/examples/http-route.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: vllm-agg-route
+spec:
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: inference-gateway
+      # Note: This assumes your gateway is installed into the same namespace as this HTTPRoute.
+      # If you installed it into a different namespace, add: namespace: <your-gateway-namespace>
+  rules:
+    - backendRefs:
+        - group: inference.networking.k8s.io
+          kind: InferencePool
+          name: vllm-agg-pool
+          port: 8000
+          weight: 1
+      matches:
+        - path:
+            type: PathPrefix
+            value: /
+      timeouts:
+        request: 300s
\ No newline at end of file
--- a/deploy/inference-gateway/scripts/install_gaie_crd_kgateway.sh
+++ b/deploy/inference-gateway/scripts/install_gaie_crd_kgateway.sh
@@ -18,6 +18,11 @@
 set -euo pipefail
 trap 'echo "Error at line $LINENO. Exiting."' ERR
+# Namespace where the inference-gateway will be deployed
+# Defaults to 'default' if NAMESPACE env var is not set
+NAMESPACE=${NAMESPACE:-default}
+echo "Installing inference-gateway into namespace: $NAMESPACE"
 # Install the Gateway API
 GATEWAY_API_VERSION=v1.4.1
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml
@@ -37,7 +42,7 @@ helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway \
  oci://cr.kgateway.dev/kgateway-dev/charts/kgateway \
  --set inferenceExtension.enabled=true
-kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/gateway/kgateway/gateway.yaml
+kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_LATEST_RELEASE}/config/manifests/gateway/kgateway/gateway.yaml -n "$NAMESPACE"
-kubectl patch gateway inference-gateway --type='json' \
+kubectl patch gateway inference-gateway -n "$NAMESPACE" --type='json' \
  -p='[{"op": "replace", "path": "/spec/gatewayClassName", "value": "kgateway"}]'
--- a/deploy/inference-gateway/helm/dynamo-gaie/.helmignore
+++ b/deploy/inference-gateway/helm/dynamo-gaie/.helmignore
--- a/deploy/inference-gateway/helm/dynamo-gaie/Chart.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/Chart.yaml
--- a/deploy/inference-gateway/helm/dynamo-gaie/epp-config-dynamo.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/epp-config-dynamo.yaml
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/NOTES.txt
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/NOTES.txt
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/_helpers.tpl
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/_helpers.tpl
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role-binding.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role-binding.yaml
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
@@ -114,9 +114,9 @@ spec:
          {{- end }}
          - name: NATS_SERVER
            value: "nats://{{ $platformName }}-nats.{{ $platformNs }}:4222"
-          - name: DYNAMO_NAMESPACE
+          - name: DYN_NAMESPACE
            value: "{{ $ns }}"
-          - name: DYNAMO_KV_BLOCK_SIZE
+          - name: DYN_KV_BLOCK_SIZE
            value: "{{ $kv }}"
          - name: USE_STREAMING
            value: "true"