feat: Add epp-aware gateway integration (#2345)

62978595 · atchernych · GitHub · 28546bad · 62978595 · 62978595
Unverified Commit 62978595 authored Aug 18, 2025 by atchernych Committed by GitHub Aug 18, 2025
7 changed files
--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
 ## Inference Gateway Setup with Dynamo

-This Setup treats each Dynamo deployment as a black box and routes traffic randomly among the deployments.
-Currently, this setup is only kgateway based Inference Gateway.
+This guide demonstrates two setups.
+The EPP-unaware setup treats each Dynamo deployment as a black box and routes traffic randomly among the deployments.
+The EPP-aware setup first uses the Dynamo Router to pick the worker instance ID for serving the model, and then directs traffic straight to the selected worker.
+Currently, both setups are supported only with the kgateway-based Inference Gateway.

 ## Table of Contents

@@ -39,7 +41,7 @@ GATEWAY_API_VERSION=v1.3.0
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml
 ```

-b. Install the Inference Extension CRDs (Inferenece Model and Inference Pool CRDs)
+b. Install the Inference Extension CRDs (Inference Model and Inference Pool CRDs)
 ```bash
 INFERENCE_EXTENSION_VERSION=v0.5.1
 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml -n  my-model
@@ -84,13 +86,39 @@ kubectl apply -f agg.yaml -n my-model

 The Inference Gateway is configured through the `inference-gateway-resources.yaml` file.

-Deploy the Inference Gateway resources to your Kubernetes cluster:
+Deploy the Inference Gateway resources to your Kubernetes cluster by running one of the commands below.
+
+For the EPP-unaware black box integration run:

 ```bash
 cd deploy/inference-gateway
 helm install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml
 ```

+For the EPP-aware integration run:
+
+```bash
+cd deploy/inference-gateway
+
+helm install dynamo-gaie ./helm/dynamo-gaie \
+  -n my-model \
+  -f ./vllm_agg_qwen.yaml \
+  -f ./values-epp-aware.yaml
+```
+
+Alternatively, customize the EPP further with `--set` flags, e.g.:
+
+```bash
+helm install dynamo-gaie ./helm/dynamo-gaie \
+  -n my-model \
+  -f ./vllm_agg_qwen.yaml \
+  --set eppAware.enabled=true \
+  --set eppAware.eppImage=docker.io/lambda108/epp-inference-extension-dynamo:1.0.0 \
+  --set imagePullSecrets='{docker-imagepullsecret}' \
+  --set-string epp.extraEnv[0].name=USE_STREAMING \
+  --set-string epp.extraEnv[0].value=true
+```
+
 Key configurations include:
 - An InferenceModel resource for the Qwen model
 - A service for the inference gateway

--- a/deploy/inference-gateway/helm/dynamo-gaie/Chart.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/Chart.yaml
@@ -30,7 +30,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
+version: 0.2.0

 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
@@ -31,14 +31,29 @@ spec:
    spec:
      # Conservatively, this timeout should mirror the longest grace period of the pods within the pool
      terminationGracePeriodSeconds: 130
+
+      {{- /* Attach pull secrets only when at least one secret name is configured. */ -}}
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+      {{- range $secretName := . }}
+        - name: {{ $secretName | quote }}
+      {{- end }}
+      {{- end }}
+
      containers:
      - name: epp
-        image: {{ .Values.extension.image }}
-        imagePullPolicy: IfNotPresent
+        # In EPP-aware mode use eppAware.eppImage when it is set; in every other
+        # case fall back to the default GAIE extension image.
+        {{- if and .Values.eppAware.enabled .Values.eppAware.eppImage }}
+        image: {{ .Values.eppAware.eppImage | quote }}
+        {{- else }}
+        image: {{ .Values.extension.image | quote }}
+        {{- end }}
+        imagePullPolicy: {{ .Values.epp.imagePullPolicy | default "IfNotPresent" }}
        args:
+        {{- if .Values.epp.argsOverride }}
+        {{- toYaml .Values.epp.argsOverride | nindent 8 }}
+        {{- else }}
          - -poolName
          - "{{ .Values.model.shortName }}-pool"
-        - "-poolNamespace"
+          - -poolNamespace
          - "{{ .Release.Namespace }}"
          - -v
          - "4"
@@ -48,6 +63,12 @@ spec:
          - "9002"
          - -grpcHealthPort
          - "9003"
+        {{- end }}
+        {{- /* Render env only when extra variables are configured, so no empty key is emitted. */ -}}
+        {{- with .Values.epp.extraEnv }}
+        env:
+        {{- range . }}
+          - name: {{ .name | quote }}
+            value: {{ .value | quote }}
+        {{- end }}
+        {{- end }}
        ports:
        - containerPort: 9002
        - containerPort: 9003
@@ -65,3 +86,25 @@ spec:
            service: inference-extension
          initialDelaySeconds: 5
          periodSeconds: 10
+
+      {{- if .Values.eppAware.enabled }}
+      {{- $sidecar := .Values.eppAware.sidecar }}
+      # Sidecar container, rendered only when eppAware.enabled is true.
+      # Optional fields are omitted when unset so no empty keys are emitted.
+      - name: {{ $sidecar.name | quote }}
+        image: {{ $sidecar.image | quote }}
+        imagePullPolicy: {{ $sidecar.imagePullPolicy | default "IfNotPresent" }}
+        {{- with $sidecar.command }}
+        command: {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with $sidecar.args }}
+        args: {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with $sidecar.env }}
+        env:
+        {{- range . }}
+          - name: {{ .name | quote }}
+            {{- if .valueFromDynamoNamespace }}
+            # Entries flagged valueFromDynamoNamespace take the chart-level dynamoNamespace value.
+            value: "{{ $.Values.dynamoNamespace }}"
+            {{- else }}
+            value: {{ .value | quote }}
+            {{- end }}
+        {{- end }}
+        {{- end }}
+        {{- with $sidecar.ports }}
+        ports:
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+        {{- with $sidecar.resources }}
+        resources:
+        {{- toYaml . | nindent 10 }}
+        {{- end }}
+      {{- end }}
\ No newline at end of file
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/http-router.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/http-router.yaml
@@ -18,6 +18,7 @@ apiVersion: gateway.networking.k8s.io/v1
 kind: HTTPRoute
 metadata:
  name: {{ .Values.model.shortName }}-route
+  namespace: {{ .Release.Namespace }}
 spec:
  parentRefs:
  - group: gateway.networking.k8s.io
@@ -28,6 +29,7 @@ spec:
    - group: inference.networking.x-k8s.io
      kind: InferencePool
      name: {{ .Values.model.shortName }}-pool
+      namespace: {{ .Release.Namespace }}
      port: {{ .Values.inferencePool.port }}
      weight: 1
    matches:

--- a/deploy/inference-gateway/helm/dynamo-gaie/values.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/values.yaml
@@ -49,5 +49,49 @@ httpRoute:
    request: "300s"

 extension:
-  # the GAIE extension
+  # default (non-epp-aware) EPP image for the GAIE extension
  image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.4.0
+
+# generic knobs you may want in both modes
+imagePullSecrets: []     # e.g. ["docker-imagepullsecret"]
+epp:
+  imagePullPolicy: IfNotPresent
+  # Extra environment variables for the EPP container, given as name/value pairs
+  extraEnv: []           # e.g. [{name: USE_STREAMING, value: "true"}]
+  # To completely override the EPP container args, supply a list here.
+  # When empty, the chart renders its built-in default args.
+  argsOverride: []
+
+# epp-aware mode toggle + specific settings
+eppAware:
+  enabled: false
+  # Optional: EPP image used when eppAware.enabled is true; falls back to extension.image if unset
+  eppImage: docker.io/lambda108/epp-inference-extension-dynamo:1.0.0
+
+  # Sidecar (frontend-router)
+  sidecar:
+    # Container name for the sidecar
+    name: frontend-router
+    # Sidecar image
+    image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1
+    # Image pull policy for the sidecar
+    imagePullPolicy: IfNotPresent
+    # Command and args for running the frontend in router mode.
+    command: ["/bin/sh", "-c"]
+    args: ["python3 -m dynamo.frontend --http-port 8000 --router-mode kv"]
+    # Environment variables for the sidecar.
+    env:
+      - name: DYNAMO_NAMESPACE
+        valueFromDynamoNamespace: true
+      - name: ETCD_ENDPOINTS
+        value: "http://dynamo-platform-etcd:2379"
+      - name: NATS_SERVER
+        value: "nats://dynamo-platform-nats:4222"
+    # Resource requests/limits for the sidecar container.
+    resources:
+      requests:
+        cpu: "1"
+        memory: "2Gi"
+    # Ports exposed by the sidecar container.
+    ports:
+      - containerPort: 8000
--- a/deploy/inference-gateway/values-epp-aware.yaml
+++ b/deploy/inference-gateway/values-epp-aware.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Values overlay that switches the dynamo-gaie chart into EPP-aware mode.
+# Pass it after the model values file, e.g.
+#   -f ./vllm_agg_qwen.yaml -f ./values-epp-aware.yaml
+eppAware:
+  # Turns on the EPP-aware branches of the chart templates.
+  enabled: true
+  # EPP image used in place of extension.image while EPP-aware mode is enabled.
+  eppImage: docker.io/lambda108/epp-inference-extension-dynamo:1.0.0
+
+# Secret names rendered into the EPP pod spec's imagePullSecrets list.
+imagePullSecrets:
+  - docker-imagepullsecret
+
+epp:
+  # Extra environment variables added to the epp container.
+  extraEnv:
+    - name: USE_STREAMING
+      value: "true"
--- a/docs/guides/dynamo_deploy/quickstart.md
+++ b/docs/guides/dynamo_deploy/quickstart.md
@@ -151,6 +151,7 @@ helm install dynamo-crds ./crds/ \
 **Step 2: Build Dependencies and Install Platform**

 ```bash
+cd deploy/cloud/helm
 helm dep build ./platform/

 kubectl create namespace ${NAMESPACE}