feat: update GAIE to release version with hints in headers (#5503)

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>

feat: update GAIE to release version with hints in headers (#5503)
Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
4810ad34 · atchernych · GitHub · b31b5b56 · b31b5b56 · 4810ad34
Unverified Commit 4810ad34 authored Jan 22, 2026 by atchernych Committed by GitHub Jan 22, 2026
20 changed files
--- a/container/Dockerfile.epp
+++ b/container/Dockerfile.epp
-#  SPDX-FileCopyrightText:  Copyright The Kubernetes Authors.
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#  #
-#  http://www.apache.org/licenses/LICENSE-2.0
-#  #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#  Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
-
-# Dockerfile.epp - Custom Dockerfile for GAIE EPP. This is to be used with the deploy/inference-gateway/build-epp-dynamo.sh
-
-ARG DOCKER_PROXY
-ARG BUILDER_IMAGE="golang:1.24"
-ARG BASE_IMAGE="ubuntu:22.04"
-
-############################
-# Builder
-############################
-FROM ${DOCKER_PROXY}${BUILDER_IMAGE} AS builder
-
-ENV CGO_ENABLED=1
-# be explicit; helps cgo when linking libstdc++
-ENV CC=gcc
-ENV CXX=g++
-
-# C/C++ toolchain for cgo, and libstdc++ for link-time
-RUN apt-get update && apt-get install -y --no-install-recommends \
-  build-essential \
-  gcc g++ \
-  libc6-dev \
-  ca-certificates \
-  && rm -rf /var/lib/apt/lists/*
-
-ARG COMMIT_SHA=unknown
-ARG BUILD_REF
-
-WORKDIR /src
-
-# deps first (cache)
-COPY go.mod go.sum ./
-RUN go mod download
-
-# source
-COPY cmd/epp ./cmd/epp
-COPY pkg/epp ./pkg/epp
-COPY internal ./internal
-COPY api ./api
-
-# sanity (optional)
-RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/include/ || echo "Headers not found"
-RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib/ || echo "Library not found"
-
-# build
-WORKDIR /src/cmd/epp
-RUN go build \
-  -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" \
-  -o /epp
-
-############################
-# Runtime
-############################
-FROM ${DOCKER_PROXY}${BASE_IMAGE} AS runtime
-
-ARG DYNAMO_COMMIT_SHA
-ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
-
-# Minimal runtime deps; include libstdc++ runtime for -lstdc++
-RUN apt-get update && apt-get install -y --no-install-recommends \
-  ca-certificates \
-  libstdc++6 \
-  && rm -rf /var/lib/apt/lists/* \
-  && groupadd -r nonroot && useradd -r -g nonroot -m -d /home/nonroot nonroot \
-  && mkdir -p /home/nonroot/.cache/huggingface/hub \
-  && chown -R nonroot:nonroot /home/nonroot
-
-WORKDIR /
-COPY --from=builder /epp /epp
-
-# Set HOME so ModelExpress can find the cache directory
-ENV HOME=/home/nonroot
-
-USER nonroot:nonroot
-ENTRYPOINT ["/epp"]
--- a/container/README.md
+++ b/container/README.md
@@ -199,8 +199,8 @@ The frontend image is a specialized container that includes the Dynamo component
 ```

 The build process automatically:
-1. Clones the Gateway API Inference Extension (GAIE) repository
-2. Builds the custom EPP image with Dynamo routing capabilities
+1. Builds the Dynamo static library for EPP KV-aware routing
+2. Builds the custom EPP Docker image using `make all` from `deploy/inference-gateway/epp/Makefile`
 3. Builds the frontend image with the EPP binary and Dynamo runtime components

 For more details, see [`deploy/inference-gateway/README.md`](../deploy/inference-gateway/README.md).

--- a/container/build.sh
+++ b/container/build.sh
@@ -138,10 +138,6 @@ SGLANG_CUDA_VERSION="12.9.1"
 SGLANG_CUDA_VERSION_CU13="13.0.1"
 SGLANG_RUNTIME_IMAGE_TAG_CU13="v0.5.7-cu130-runtime"

-# GAIE (Gateway API Inference Extension) configuration for frontend (required for EPP binary for frontend image)
-GAIE_REPO_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension.git"
-GAIE_VERSION="v0.5.1"
-
 PYTHON_VERSION="3.12"

 NIXL_REF=0.8.0
@@ -969,39 +965,33 @@ show_image_options
 # Handle FRONTEND target: build EPP image first
 if [[ ${TARGET^^} == "FRONTEND" ]]; then
    echo "Building FRONTEND image - requires EPP image"
-
-    # Build base dynamo image first (framework=NONE, target=dev)
    echo ""
-    echo "Building EPP image for Frontend..."
-    # Set up paths for GAIE
-    GAIE_CLONE_DIR="${BUILD_CONTEXT}/.build/external/gateway-api-inference-extension"
+    echo "Building EPP image for Frontend using Makefile..."

-    # Clone GAIE repo
-    echo ""
-    echo "Cloning GAIE repository at ${GAIE_VERSION}..."
-    $RUN_PREFIX rm -rf "${GAIE_CLONE_DIR}"
-    $RUN_PREFIX mkdir -p "$(dirname "${GAIE_CLONE_DIR}")"
-    $RUN_PREFIX git clone ${GAIE_REPO_URL} "${GAIE_CLONE_DIR}"
-    $RUN_PREFIX cd "${GAIE_CLONE_DIR}"
-    $RUN_PREFIX git checkout ${GAIE_VERSION}
-    $RUN_PREFIX cd "${BUILD_CONTEXT}"
-
-    # Build EPP image
-    echo ""
-    echo "Building EPP image..."
-    export GAIE_DIR="${GAIE_CLONE_DIR}"
-    export DYNAMO_DIR="${BUILD_CONTEXT}"
+    # EPP directory with the new self-contained build
+    EPP_DIR="${BUILD_CONTEXT}/deploy/inference-gateway/epp"

    # Set DOCKER_PROXY from ECR_HOSTNAME if available (for pulling base images through proxy)
+    # This prevents rate-limiting when building in CI across multiple PRs
+    DOCKER_PROXY_ARG=""
    if [[ -n "${ECR_HOSTNAME}" ]]; then
-        export DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
+        DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
+        DOCKER_PROXY_ARG="DOCKER_PROXY=${DOCKER_PROXY}"
        echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
    fi

-    $RUN_PREFIX bash ${DYNAMO_DIR}/deploy/inference-gateway/build-epp-dynamo.sh
-
-    # Set EPP image tag (matches what build-epp-dynamo.sh produces)
-    EPP_IMAGE_TAG="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:${GAIE_VERSION}-dirty"
+    # Build EPP image using the Makefile
+    # The Makefile handles: building Dynamo library, building Docker image, loading it locally
+    $RUN_PREFIX make -C "${EPP_DIR}" all DYNAMO_DIR="${BUILD_CONTEXT}" ${DOCKER_PROXY_ARG}
+
+    # Compute EPP image tag (must match Makefile's IMAGE_TAG)
+    # IMAGE_TAG = $(IMAGE_REPO):$(GIT_TAG)
+    # IMAGE_REPO = $(DOCKER_SERVER)/$(IMAGE_NAME)
+    # Image lives in local cache only, not pushed to any registry
+    #
+    # git is pointed at EPP_DIR (-C) because `make -C "${EPP_DIR}"` evaluates
+    # GIT_TAG from that directory. Using the caller's working directory instead
+    # could fall back to "dev" here (e.g. when this script is started from
+    # outside the repository) while the Makefile tagged the image with the real
+    # `git describe` output, leaving EPP_IMAGE_TAG pointing at a missing image.
+    EPP_DOCKER_SERVER="dynamo"
+    EPP_IMAGE_NAME="dynamo-epp"
+    EPP_GIT_TAG=$(git -C "${EPP_DIR}" describe --tags --dirty --always 2>/dev/null || echo "dev")
+    EPP_IMAGE_TAG="${EPP_DOCKER_SERVER}/${EPP_IMAGE_NAME}:${EPP_GIT_TAG}"

    echo "Successfully built EPP image: ${EPP_IMAGE_TAG}"


--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
 ## Inference Gateway Setup with Dynamo

-When integrating Dynamo with the Inference Gateway you could either use the default EPP image provided by the extension or use the custom Dynamo image.
+When integrating Dynamo with the Inference Gateway it is recommended to use the custom Dynamo EPP image.

-1. When using the Dynamo custom EPP image you will take advantage of the Dynamo router when EPP chooses the best worker to route the request to. This setup uses a custom Dynamo plugin `dyn-kv` to pick the best worker. In this case the Dynamo routing logic is moved upstream. We recommend this approach.
+1. **Dynamo EPP (Recommended):** The custom Dynamo EPP image integrates the Dynamo router directly into the gateway's endpoint picker. Using the `dyn-kv` plugin, it selects the optimal worker based on KV cache state and tokenized prompt before routing the request. The integration moves intelligent routing upstream to the gateway layer.

-2. When using the GAIE-provided image for the EPP, the Dynamo deployment is treated as a black box and the EPP would route round-robin. In this case GAIE just fans out the traffic, and the smarts only remain within the Dynamo graph. Use this if you have one Dynamo graph and do not want to obtain the Dynamo EPP image. This is a "backup" approach.
+2. **Standard EPP (Fallback):** You can use the default GAIE EPP image, which treats the Dynamo deployment as a black box and routes requests round-robin. Routing intelligence remains within the Dynamo graph itself. Use this approach if you have a single Dynamo graph and don't need the custom EPP image.
+
+EPP’s default kv-routing approach is not token-aware because the prompt is not tokenized. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).

 The setup provided here uses the Dynamo custom EPP by default. Set `epp.useDynamo=false` in your deployment to pick the approach 2.

-EPP’s default kv-routing approach is not token-aware because the prompt is hashed without tokenization. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
+Dynamo Integration with the Inference Gateway supports Aggregated and Disaggregated Serving.
+If you want to use LoRA, deploy Dynamo either without the Inference Gateway or with the Inference Gateway in the black-box (Standard EPP) approach.

 Currently, these setups are only supported with the kGateway based Inference Gateway.

@@ -16,7 +19,19 @@ Currently, these setups are only supported with the kGateway based Inference Gat

 - [Prerequisites](#prerequisites)
 - [Installation Steps](#installation-steps)
- [Usage](#6-usage)
+  - [1. Install Dynamo Platform](#1-install-dynamo-platform)
+  - [2. Deploy Inference Gateway](#2-deploy-inference-gateway)
+  - [3. Deploy Your Model](#3-deploy-your-model)
+  - [4. Build EPP image](#4-build-epp-image)
+  - [5. Install Dynamo GAIE helm chart](#5-install-dynamo-gaie-helm-chart)
+  - [6. Verify Installation](#6-verify-installation)
+  - [7. Usage](#7-usage)
+  - [8. Deleting the installation](#8-deleting-the-installation)
+- [Gateway API Inference Extension Details](#gateway-api-inference-extension-integration)
+  - [v1.2.1 API Changes](#v121-api-changes)
+  - [Building for v1.2.1](#building-for-v121)
+  - [Header-Only Routing for v1.2.1](#header-only-routing-for-v121)
+

 ## Prerequisites

@@ -34,19 +49,22 @@ Currently, these setups are only supported with the kGateway based Inference Gat
 First, deploy an inference gateway service. In this example, we'll install `kgateway` based gateway implementation.

 ```bash
-./install_gaie_crd_kgateway.sh
+cd deploy/inference-gateway
+./scripts/install_gaie_crd_kgateway.sh
 ```
+**Note**: The manifest at `config/manifests/gateway/kgateway/gateway.yaml` uses `gatewayClassName: agentgateway`, but kGateway's helm chart creates a GatewayClass named `kgateway`. The patch command in the script fixes this mismatch.

-Verify installation:
+#### f. Verify the Gateway is running

 ```bash
-kubectl get gateway inference-gateway -n my-model
+kubectl get gateway inference-gateway

 # Sample output
 # NAME                CLASS      ADDRESS   PROGRAMMED   AGE
-# inference-gateway   kgateway   x.x.x.x   True         1m
+# inference-gateway   kgateway             True         1m
 ```

+
 ### 3. Deploy Your Model ###

 Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace.
@@ -54,7 +72,8 @@ Follow the steps in [model deployment](../../examples/backends/vllm/deploy/READM
 Sample commands to deploy model:

 ```bash
-cd <dynamo-source-root>/examples/backends/vllm/deploy
+cd <dynamo-source-root>
+cd examples/backends/vllm/deploy
 kubectl apply -f agg.yaml -n my-model
 ```

@@ -83,14 +102,42 @@ Create a model configuration file similar to the vllm_agg_qwen.yaml for your mod
 This file demonstrates the values needed for the Vllm Agg setup in [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml)
 Take a note of the model's block size provided in the model card.

-### 4. Install Dynamo GAIE helm chart ###
+### 4. Build EPP image
+
+You can either use the provided Dynamo FrontEnd image as the EPP image or build your own custom Dynamo EPP image by following the steps below.
+
+```bash
+# export env vars
+export DOCKER_SERVER=ghcr.io/nvidia/dynamo	# Container registry
+export IMAGE_TAG=YOUR-TAG # Or auto from git tag
+cd deploy/inference-gateway/epp
+make all # Do everything in one command
+# or make all-push to also push
+
+
+# Or step-by-step
+make dynamo-lib # Build Dynamo library and copy to project
+make image-load # Build Docker image and load locally
+make image-push # Build and push to registry
+make info # Check image tag
+```
+
+#### All-in-one Targets
+
+| Target | Description |
+|--------|-------------|
+| `make dynamo-lib` | Build Dynamo static library and copy to project |
+| `make all` | Build Dynamo lib + Docker image + load locally |
+| `make all-push` | Build Dynamo lib + Docker image + push to registry |
+
+### 5. Install Dynamo GAIE helm chart ###

 The Inference Gateway is configured through the `inference-gateway-resources.yaml` file.

 Deploy the Inference Gateway resources to your Kubernetes cluster by running the command below.

 ```bash
-cd deploy/inference-gateway
+cd deploy/inference-gateway/

 # Export the Dynamo image you have used when deploying your model in Step 3.
 export DYNAMO_IMAGE=<the-dynamo-image-you-have-used-when-deploying-the-model>
@@ -122,7 +169,7 @@ You can configure the plugin by setting environment vars in your [values-dynamo-

 - Overwrite the `DYN_NAMESPACE` env var if needed to match your model's dynamo namespace.
 - Set `DYNAMO_BUSY_THRESHOLD` to configure the upper bound on how “full” a worker can be (often derived from kv_active_blocks or other load metrics) before the router skips it. If the selected worker exceeds this value, routing falls back to the next best candidate. By default the value is negative meaning this is not enabled.
- Set `DYNAMO_ROUTER_REPLICA_SYNC=true` to enable a background watcher to keep multiple router instances in sync (important if you run more than one KV router per component).
+- Set `DYNAMO_ENFORCE_DISAGG=true` if you want to enforce that every request is served in disaggregated mode. By default it is false, meaning that if the prefill worker is not available the request is served in aggregated mode.
 - By default the Dynamo plugin uses KV routing. You can expose `DYNAMO_USE_KV_ROUTING=false`  in your [values-dynamo-epp.yaml] if you prefer to route in the round-robin fashion.
 - If using kv-routing:
  - Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) to match your model's block size. The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
@@ -132,52 +179,25 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
  - See the [KV cache routing design](../../docs/router/kv_cache_routing.md) for details.


-
-Dynamo provides a custom routing plugin `pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go` to perform efficient kv routing.
-The Dynamo router is built as a static library, the EPP router will call to provide fast inference.
-You can either use the special FrontEnd image for the EPP_IMAGE in the Helm deployment command and proceed to the step 2 or you can build the image yourself following the steps below.
-
-##### 1. Build the custom EPP image #####
-
-If you choose to build your own image, use the `container/build.sh` script with the `--target frontend` option:
-
-```bash
-./container/build.sh --framework none --target frontend
-```
-
-This command automatically:
- Clones the Gateway API Inference Extension (GAIE) repository at the correct version
- Builds the Dynamo Router static library
- Applies the necessary patches to the EPP codebase
- Builds the custom EPP image with Dynamo KV routing support
- Builds the frontend image with the EPP binary and Dynamo runtime components
-
-Re-tag the freshly built image and push it to your registry:
-
-```bash
-docker images
-docker tag <your-new-id> <your-image-tag>
-docker push <your-image-tag>
-```
-
 **Note**
-You can also use the standard EPP image`us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.4.0`. For the basic black box integration run:
+You can also use the standard EPP image, i.e. `us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v1.2.1`, for the basic black-box integration.

 ```bash
 cd deploy/inference-gateway
+helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml
+
 # Optionally export the standard EPP image if you do not want to use the default we suggest.
 export EPP_IMAGE=us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v1.2.1
-helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false
+helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false --set-string extension.image=$EPP_IMAGE
 # Optionally overwrite the image --set-string extension.image=$EPP_IMAGE
 ```

-### 5. Verify Installation ###
+### 6. Verify Installation ###

 Check that all resources are properly deployed:

 ```bash
 kubectl get inferencepool
-kubectl get inferencemodel
 kubectl get httproute
 kubectl get service
 kubectl get gateway
@@ -190,16 +210,12 @@ Sample output:
 NAME        AGE
 qwen-pool   33m

-# kubectl get inferencemodel
-NAME         MODEL NAME        INFERENCE POOL   CRITICALITY   AGE
-qwen-model   Qwen/Qwen3-0.6B   qwen-pool        Critical      33m
-
 # kubectl get httproute
 NAME        HOSTNAMES   AGE
 qwen-route               33m
 ```

-### 6. Usage ###
+### 7. Usage ###

 The Inference Gateway provides HTTP endpoints for model inference.

@@ -310,11 +326,56 @@ Sample inference output:
 }
 ```

-### 7. Deleting the installation ###
+### 8. Deleting the installation ###

 If you need to uninstall run:

 ```bash
 kubectl delete dynamoGraphDeployment vllm-agg
 helm uninstall dynamo-gaie -n my-model
+
+# To uninstall GAIE
+# 1. Delete the inference-gateway
+kubectl delete gateway inference-gateway --ignore-not-found
+
+# 2. Uninstall kgateway helm releases
+helm uninstall kgateway -n kgateway-system
+helm uninstall kgateway-crds -n kgateway-system
+
+# 3. Delete the kgateway-system namespace (optional, cleans up everything in it)
+#    The kgateway helm releases were already uninstalled in step 2.
+kubectl delete namespace kgateway-system --ignore-not-found
+
+# 4. Delete the Inference Extension CRDs
+IGW_LATEST_RELEASE=v1.2.1
+kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${IGW_LATEST_RELEASE}/manifests.yaml --ignore-not-found
+
+# 5. Delete the Gateway API CRDs
+GATEWAY_API_VERSION=v1.4.1
+kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml --ignore-not-found
 ```
+
+## Gateway API Inference Extension Integration
+
+This section documents the updated plugin implementation for Gateway API Inference Extension **v1.2.1**.
+
+### v1.2.1 API Changes
+
+
+### Building for v1.2.1
+
+The plugin code for v1.2.1 is in:
+- `pkg/plugins/dynamo_kv_scorer/plugin.go`
+
+
+### Header-Only Routing for v1.2.1
+
+In v1.2.1, the EPP uses a **header-only approach** for communicating routing decisions.
+The plugins set HTTP headers that are forwarded to the backend workers.
+
+#### Headers Set by Dynamo Plugins
+
+| Header | Description | Set By |
+|--------|-------------|--------|
+| `x-worker-instance-id` | Primary worker ID (decode worker in disagg mode) | kv-aware-scorer |
+| `x-prefill-instance-id` | Prefill worker ID (disaggregated mode only) | kv-aware-scorer |
--- a/deploy/inference-gateway/build-epp-dynamo.sh
+++ b/deploy/inference-gateway/build-epp-dynamo.sh
-#!/usr/bin/env bash
-
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e  # Exit on any error
-
-# Configuration - Set these environment variables before running
-if [[ -z "${DYNAMO_DIR}" ]]; then
-    echo "DYNAMO_DIR environment variable must be set"
-    echo "   Example: export DYNAMO_DIR=/path/to/dynamo"
-    exit 1
-fi
-
-if [[ -z "${GAIE_DIR}" ]]; then
-    echo "GAIE_DIR environment variable must be set"
-    echo "   Example: export GAIE_DIR=/path/to/gateway-api-inference-extension"
-    exit 1
-fi
-DYNAMO_LIB_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib"
-DYNAMO_INCLUDE_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/include"
-
-echo "Building Dynamo KV Router C Library..."
-
-# Step 1: Build the static library
-echo "Building static library..."
-cd "${DYNAMO_DIR}"
-cargo build --release -p libdynamo_llm
-
-# Step 2: Generate header file (with fallback)
-echo "Generating C header..."
-HEADER_OUTPUT="${DYNAMO_DIR}/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h"
-
-if ! cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm --output "${HEADER_OUTPUT}"; then
-    echo "cbindgen failed, using fallback header..."
-    cp "${DYNAMO_DIR}/lib/bindings/c/src/fallback_header.h" "${HEADER_OUTPUT}"
-fi
-
-# Step 3: Ensure directories exist
-echo "Preparing directories..."
-mkdir -p "${DYNAMO_LIB_DIR}"
-mkdir -p "${DYNAMO_INCLUDE_DIR}"
-
-# Step 4: Copy files to GAIE project
-echo "Copying files to the GAIE project..."
-cp "${HEADER_OUTPUT}" "${DYNAMO_INCLUDE_DIR}/"
-cp "${DYNAMO_DIR}/target/release/libdynamo_llm_capi.a" "${DYNAMO_LIB_DIR}/"
-cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${GAIE_DIR}/Dockerfile.dynamo"
-
-# Verify files were copied
-if [[ ! -f "${DYNAMO_INCLUDE_DIR}/llm_engine.h" ]]; then
-    echo "Header file copy failed!"
-    exit 1
-fi
-
-if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
-    echo "Library file copy failed!"
-    exit 1
-fi
-
-if [[ ! -f "${GAIE_DIR}/Dockerfile.dynamo" ]]; then
-    echo "Docker.dynamo file copy failed!"
-    exit 1
-fi
-
-echo "Files copied successfully:"
-echo "   Header: ${DYNAMO_INCLUDE_DIR}/llm_engine.h"
-echo "   Library: ${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a"
-echo "   Docker: ${GAIE_DIR}/Dockerfile.epp"
-
-# Step 5: Apply Dynamo patch (if it exists)
-echo "Applying Dynamo patch..."
-cd "${GAIE_DIR}"
-
-PATCH_FILE="${DYNAMO_DIR}/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch"
-if [[ -f "${PATCH_FILE}" ]]; then
-    if git apply --check "${PATCH_FILE}" 2>/dev/null; then
-        git apply "${PATCH_FILE}"
-        echo "Patch applied successfully"
-    else
-        echo "Patch doesn't apply cleanly - may already be applied or need manual resolution"
-    fi
-else
-    echo "No patch file found at ${PATCH_FILE}"
-fi
-
-# Step 6: Build the EPP image
-echo "Building the custom EPP image for GAIE..."
-
-# Build make args - pass DOCKER_PROXY if set (e.g., from ECR_HOSTNAME)
-MAKE_ARGS=""
-if [[ -n "${DOCKER_PROXY}" ]]; then
-    echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
-    MAKE_ARGS+="DOCKER_PROXY=${DOCKER_PROXY} "
-fi
-
-make ${MAKE_ARGS} dynamo-image-local-load
-
-echo "EPP image with Dynamo KV routing built"
--- a/deploy/inference-gateway/epp-patches/v0.5.1-1/epp-v0.5.1-dyn1.patch
+++ b/deploy/inference-gateway/epp-patches/v0.5.1-1/epp-v0.5.1-dyn1.patch
--- a/deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch
+++ b/deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch
--- a/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch
+++ b/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch
--- a/deploy/inference-gateway/epp/Dockerfile
+++ b/deploy/inference-gateway/epp/Dockerfile
+# SPDX-FileCopyrightText:  Copyright The Kubernetes Authors.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#  http://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#  Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
+
+# Dynamo EPP Dockerfile
+# Builds a custom EPP image with Dynamo KV-aware routing plugins
+#
+# PREREQUISITES: Run `make dynamo-lib` before building this image to ensure
+# the Dynamo FFI library and headers are in place.
+
+ARG BUILDER_IMAGE=golang:1.24-bookworm
+ARG BASE_IMAGE=ubuntu:24.04
+
+# =============================================================================
+# Build stage
+# =============================================================================
+FROM ${BUILDER_IMAGE} AS builder
+
+# Docker buildx provides these automatically for multi-platform builds
+ARG TARGETOS=linux
+ARG TARGETARCH
+
+WORKDIR /workspace
+
+# Install build dependencies for CGO.
+# --no-install-recommends keeps this layer limited to the packages named here.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    libc-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy go mod files first for better caching
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Copy the source code (including pre-built Dynamo library)
+COPY . .
+
+# Verify Dynamo library exists
+RUN if [ ! -f "pkg/plugins/dynamo_kv_scorer/lib/libdynamo_llm_capi.a" ]; then \
+        echo "ERROR: Dynamo library not found!"; \
+        echo "Run 'make dynamo-lib' before building the Docker image."; \
+        exit 1; \
+    fi
+
+# Version metadata is declared here, immediately before its only use.
+# Every RUN that follows an ARG declaration receives the ARG as an environment
+# variable, so declaring these at the top of the stage would invalidate the
+# cached apt-get and `go mod download` layers each time the commit changes.
+ARG COMMIT_SHA
+ARG BUILD_REF
+
+# Build with CGO enabled for the Dynamo FFI
+# Use TARGETOS/TARGETARCH from Docker buildx for proper platform support
+RUN CGO_ENABLED=1 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \
+    -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.GitVersion=${BUILD_REF} \
+              -X sigs.k8s.io/gateway-api-inference-extension/version.GitCommit=${COMMIT_SHA}" \
+    -o epp ./cmd/epp
+
+# =============================================================================
+# Runtime stage
+# =============================================================================
+FROM ${BASE_IMAGE}
+
+# Install runtime dependencies:
+#   ca-certificates - CA bundle for outbound TLS connections
+#   libstdc++6      - C++ runtime needed by the CGO-linked Dynamo library
+# --no-install-recommends keeps the runtime image limited to these packages.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libstdc++6 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /
+
+# Copy the binary from builder
+COPY --from=builder /workspace/epp .
+
+# Note: EPP config is mounted via Kubernetes ConfigMap at runtime
+# See helm/dynamo-gaie/templates/epp-configmap.yaml
+
+# Create non-root user (numeric UID/GID so the USER line does not depend on
+# name resolution inside the image)
+RUN useradd -r -u 65532 -g nogroup nonroot
+USER 65532:65534
+
+ENTRYPOINT ["/epp"]
--- a/deploy/inference-gateway/epp/Makefile
+++ b/deploy/inference-gateway/epp/Makefile
+# Dynamo EPP Makefile
+# Builds custom EPP image with Dynamo KV-aware routing plugins
+
+# Image configuration
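+# By default the image is tagged for the local cache only (dynamo/dynamo-epp:<git tag>);
+# set DOCKER_SERVER to a real registry before using the image-push / all-push targets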
+DOCKER_SERVER ?= dynamo
+IMAGE_NAME := dynamo-epp
+GIT_COMMIT_SHA ?= $(shell git rev-parse HEAD 2>/dev/null || echo "unknown")
+GIT_TAG ?= $(shell git describe --tags --dirty --always 2>/dev/null || echo "dev")
+IMAGE_REPO ?= $(DOCKER_SERVER)/$(IMAGE_NAME)
+IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
+
+# Build configuration
+# Auto-detect host architecture for consistent builds with Dynamo library
+# The Dynamo library is built for the host arch, so Docker must match
+HOST_ARCH := $(shell uname -m)
+ifeq ($(HOST_ARCH),x86_64)
+    PLATFORMS ?= linux/amd64
+else ifeq ($(HOST_ARCH),aarch64)
+    PLATFORMS ?= linux/arm64
+else ifeq ($(HOST_ARCH),arm64)
+    PLATFORMS ?= linux/arm64
+else
+    PLATFORMS ?= linux/amd64
+endif
+# Docker proxy for avoiding rate limits (e.g., ECR mirror)
+# Set DOCKER_PROXY to prefix base images, e.g., DOCKER_PROXY=my-registry.com/dockerhub/
+DOCKER_PROXY ?=
+
+DOCKER_BUILDX_CMD ?= docker buildx
+IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
+BUILDER_IMAGE ?= $(DOCKER_PROXY)golang:1.24
+BASE_IMAGE ?= $(DOCKER_PROXY)ubuntu:24.04
+
+# Container tool
+CONTAINER_TOOL ?= docker
+
+# Kind cluster name for local testing
+KIND_CLUSTER ?= kind
+
+# Project directory
+PROJECT_DIR := $(shell pwd)
+
+# Dynamo directories
+# Default: assume we're in dynamo/deploy/inference-gateway/epp
+DYNAMO_DIR ?= $(shell cd $(PROJECT_DIR)/../../.. && pwd)
+DYNAMO_LIB_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/lib
+DYNAMO_INCLUDE_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/include
+
+.PHONY: help
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ Development
+
+.PHONY: fmt
+fmt: ## Run go fmt
+	go fmt ./...
+
+.PHONY: vet
+vet: ## Run go vet
+	go vet ./...
+
+.PHONY: tidy
+tidy: ## Run go mod tidy
+	go mod tidy
+
+.PHONY: test
+test: ## Run tests
+	CGO_ENABLED=1 go test ./... -v
+
+##@ Build
+
+.PHONY: build
+build: dynamo-lib-check ## Build the EPP binary locally (requires CGO and Dynamo libraries)
+	CGO_ENABLED=1 go build -o bin/epp ./cmd/epp
+
+.PHONY: build-with-lib
+build-with-lib: dynamo-lib build ## Build Dynamo library and EPP binary
+
+.PHONY: image-build
+image-build: dynamo-lib-check ## Build the Docker image using buildx
+	$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
+		--platform=$(PLATFORMS) \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
+		--build-arg COMMIT_SHA=$(GIT_COMMIT_SHA) \
+		--build-arg BUILD_REF=$(GIT_TAG) \
+		$(PUSH) \
+		$(LOAD) \
+		.
+
+.PHONY: image-push
+image-push: PUSH=--push ## Build and push the Docker image
+image-push: image-build
+
+.PHONY: image-load
+image-load: LOAD=--load ## Build and load the Docker image locally
+image-load: image-build
+
+.PHONY: image-kind
+image-kind: image-load ## Build and load the image into kind cluster
+	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
+
+##@ Local Development with Buildx
+
+.PHONY: image-local-build
+image-local-build: ## Build image using a new buildx builder
+	# Create a throwaway buildx builder, run image-build with it, and always
+	# remove the builder afterwards - even when the build fails - so failed
+	# builds do not leave stray builders behind. The recipe exits with the
+	# build's status, or with 1 if only the cleanup failed.
+	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use) || exit 1; \
+	rc=0; \
+	$(MAKE) image-build PUSH=$(PUSH) LOAD=$(LOAD) || rc=$$?; \
+	if ! $(DOCKER_BUILDX_CMD) rm "$$BUILDER" && [ "$$rc" -eq 0 ]; then rc=1; fi; \
+	exit $$rc
+
+.PHONY: image-local-push image-local-load
+
+# Both targets set one flag and hand over to image-local-build, which
+# forwards PUSH / LOAD to the nested image-build invocation.
+image-local-push: image-local-build
+image-local-push: PUSH=--push ## Build and push using local buildx builder
+
+image-local-load: image-local-build
+image-local-load: LOAD=--load ## Build and load using local buildx builder
+
+##@ Dynamo Library Build
+
+# Builds the libdynamo_llm crate in release mode inside DYNAMO_DIR, produces
+# the C header, and copies header + static library into this project.
+# Header generation is best-effort: if cbindgen fails (or is not installed)
+# the checked-in fallback header is used instead. A warning is printed in
+# that case so a fallback header is never picked up silently.
+.PHONY: dynamo-lib
+dynamo-lib: ## Build Dynamo static library and copy to project
+	@echo "Building Dynamo static library..."
+	cd "$(DYNAMO_DIR)" && cargo build --release -p libdynamo_llm
+	@echo "Generating C header..."
+	@mkdir -p "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm"
+	cd "$(DYNAMO_DIR)" && \
+		(cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm \
+			--output lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h || \
+		(echo "WARNING: cbindgen failed; using lib/bindings/c/src/fallback_header.h instead" >&2 && \
+			cp lib/bindings/c/src/fallback_header.h lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h))
+	@echo "Copying files to EPP project..."
+	@mkdir -p "$(DYNAMO_LIB_DIR)"
+	@mkdir -p "$(DYNAMO_INCLUDE_DIR)"
+	cp "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h" "$(DYNAMO_INCLUDE_DIR)/"
+	cp "$(DYNAMO_DIR)/target/release/libdynamo_llm_capi.a" "$(DYNAMO_LIB_DIR)/"
+	@echo "Dynamo library ready!"
+
+# Guard target: fails with a hint unless both the static library and the C
+# header are present in the project's staging directories.
+.PHONY: dynamo-lib-check
+dynamo-lib-check: ## Check if Dynamo library files exist
+	@test -f "$(DYNAMO_LIB_DIR)/libdynamo_llm_capi.a" || { \
+		echo "ERROR: Dynamo library not found. Run 'make dynamo-lib' first."; \
+		exit 1; \
+	}
+	@test -f "$(DYNAMO_INCLUDE_DIR)/llm_engine.h" || { \
+		echo "ERROR: Dynamo header not found. Run 'make dynamo-lib' first."; \
+		exit 1; \
+	}
+	@echo "Dynamo library files found."
+
+##@ Clean
+
+# Removes the bin/ output directory and runs `go clean`.
+# The files copied by dynamo-lib into DYNAMO_LIB_DIR / DYNAMO_INCLUDE_DIR
+# are not touched by this target.
+.PHONY: clean
+clean: ## Clean build artifacts
+	rm -rf bin/
+	go clean
+
+##@ All-in-one Build
+
+# Each all-in-one target first builds the Dynamo library and only then builds
+# the image. The two steps are sequential sub-makes rather than prerequisites:
+# with `make -j` prerequisites may run concurrently, so the image build (and
+# its dynamo-lib-check) could otherwise start before dynamo-lib has finished
+# copying the library and header into place.
+.PHONY: all
+all: ## Build Dynamo lib and Docker image, load locally
+	$(MAKE) dynamo-lib
+	$(MAKE) image-local-load
+
+.PHONY: all-push
+all-push: ## Build Dynamo lib and Docker image, push to registry
+	$(MAKE) dynamo-lib
+	$(MAKE) image-push
+
+.PHONY: all-kind
+all-kind: ## Build Dynamo lib and Docker image, load to kind
+	$(MAKE) dynamo-lib
+	$(MAKE) image-kind
+
+##@ Info
+
+# Prints the resolved value of each build setting, one per line, from a
+# single shell invocation.
+.PHONY: info
+info: ## Show build info
+	@echo "Image Tag: $(IMAGE_TAG)"; \
+	echo "Git Commit: $(GIT_COMMIT_SHA)"; \
+	echo "Git Tag: $(GIT_TAG)"; \
+	echo "Platforms: $(PLATFORMS)"; \
+	echo "Docker Proxy: $(DOCKER_PROXY)"; \
+	echo "Builder Image: $(BUILDER_IMAGE)"; \
+	echo "Base Image: $(BASE_IMAGE)"; \
+	echo "Dynamo Dir: $(DYNAMO_DIR)"; \
+	echo "Dynamo Lib Dir: $(DYNAMO_LIB_DIR)"; \
+	echo "Dynamo Include Dir: $(DYNAMO_INCLUDE_DIR)"
+
+
--- a/deploy/inference-gateway/epp/cmd/epp/main.go
+++ b/deploy/inference-gateway/epp/cmd/epp/main.go
+/*
+Copyright 2025 NVIDIA Corporation.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Dynamo EPP - Custom Endpoint Picker Plugin for NVIDIA Dynamo
+//
+// This EPP integrates with the Gateway API Inference Extension to provide
+// KV-aware routing for Dynamo inference backends.
+//
+// # Header-Based Routing
+//
+// The Dynamo KV scorer sets routing headers that the Lua filter at the
+// gateway uses to inject nvext into the request body:
+//
+//   - x-worker-instance-id: Selected worker ID (decode worker in disagg mode)
+//   - x-prefiller-host-port: Prefill worker ID (disaggregated mode only)
+//   - x-dynamo-routing-mode: "aggregated" or "disaggregated"
+//
+// The Lua filter reads these headers and injects:
+//   - Aggregated: {"nvext": {"backend_instance_id": <worker_id>}}
+//   - Disaggregated: {"nvext": {"prefill_worker_id": <prefill>, "decode_worker_id": <decode>}}
+package main
+
+import (
+	"os"
+
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+
+	// Dynamo plugins
+	dynscorer "github.com/nvidia/dynamo/deploy/inference-gateway/pkg/plugins/dynamo_kv_scorer"
+)
+
+// main registers the Dynamo KV-aware scorer under the plugin type name
+// "kv-aware-scorer" and then hands control to the standard GAIE runner.
+// The process exits with status 1 if the runner returns an error.
+func main() {
+	// The scorer factory must be registered before the runner starts.
+	// As described for this plugin, it implements three interfaces:
+	//   - Score: asks the Dynamo router to pick workers based on KV cache and sets routing headers
+	//   - PreRequest: records the request in router bookkeeping once scheduling is final
+	//   - ResponseComplete: releases that bookkeeping when the response completes
+	plugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
+
+	// Standard GAIE runner; it registers the built-in plugins itself.
+	err := runner.NewRunner().Run(ctrl.SetupSignalHandler())
+	if err != nil {
+		os.Exit(1)
+	}
+}
--- a/deploy/inference-gateway/epp/go.mod
+++ b/deploy/inference-gateway/epp/go.mod
+module github.com/nvidia/dynamo/deploy/inference-gateway
+
+go 1.24.0
+
+require (
+	sigs.k8s.io/controller-runtime v0.22.4
+	sigs.k8s.io/gateway-api-inference-extension v1.2.1
+)
+
+require (
+	cel.dev/expr v0.24.0 // indirect
+	github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
+	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/blang/semver/v4 v4.0.0 // indirect
+	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+	github.com/dennwc/varint v1.0.0 // indirect
+	github.com/emicklei/go-restful/v3 v3.13.0 // indirect
+	github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect
+	github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
+	github.com/evanphx/json-patch/v5 v5.9.11 // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/fsnotify/fsnotify v1.9.0 // indirect
+	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
+	github.com/go-logr/zapr v1.3.0 // indirect
+	github.com/go-openapi/jsonpointer v0.21.2 // indirect
+	github.com/go-openapi/jsonreference v0.21.0 // indirect
+	github.com/go-openapi/swag v0.23.1 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/google/btree v1.1.3 // indirect
+	github.com/google/cel-go v0.26.0 // indirect
+	github.com/google/gnostic-models v0.7.0 // indirect
+	github.com/google/go-cmp v0.7.0 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
+	github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
+	github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
+	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/josharian/intern v1.0.0 // indirect
+	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/mailru/easyjson v0.9.0 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/prometheus/client_golang v1.23.2 // indirect
+	github.com/prometheus/client_model v0.6.2 // indirect
+	github.com/prometheus/common v0.67.4 // indirect
+	github.com/prometheus/procfs v0.17.0 // indirect
+	github.com/prometheus/prometheus v0.308.1 // indirect
+	github.com/spf13/cobra v1.9.1 // indirect
+	github.com/spf13/pflag v1.0.10 // indirect
+	github.com/stoewer/go-strcase v1.3.0 // indirect
+	github.com/x448/float16 v0.8.4 // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
+	go.opentelemetry.io/otel v1.39.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
+	go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 // indirect
+	go.opentelemetry.io/otel/metric v1.39.0 // indirect
+	go.opentelemetry.io/otel/sdk v1.39.0 // indirect
+	go.opentelemetry.io/otel/trace v1.39.0 // indirect
+	go.opentelemetry.io/proto/otlp v1.9.0 // indirect
+	go.uber.org/atomic v1.11.0 // indirect
+	go.uber.org/multierr v1.11.0 // indirect
+	go.uber.org/zap v1.27.1 // indirect
+	go.yaml.in/yaml/v2 v2.4.3 // indirect
+	go.yaml.in/yaml/v3 v3.0.4 // indirect
+	golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect
+	golang.org/x/net v0.47.0 // indirect
+	golang.org/x/oauth2 v0.32.0 // indirect
+	golang.org/x/sync v0.19.0 // indirect
+	golang.org/x/sys v0.39.0 // indirect
+	golang.org/x/term v0.37.0 // indirect
+	golang.org/x/text v0.31.0 // indirect
+	golang.org/x/time v0.13.0 // indirect
+	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
+	google.golang.org/grpc v1.78.0 // indirect
+	google.golang.org/protobuf v1.36.11 // indirect
+	gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
+	gopkg.in/inf.v0 v0.9.1 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+	k8s.io/api v0.34.3 // indirect
+	k8s.io/apiextensions-apiserver v0.34.3 // indirect
+	k8s.io/apimachinery v0.34.3 // indirect
+	k8s.io/apiserver v0.34.3 // indirect
+	k8s.io/client-go v0.34.3 // indirect
+	k8s.io/component-base v0.34.3 // indirect
+	k8s.io/klog/v2 v2.130.1 // indirect
+	k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
+	k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d // indirect
+	sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect
+	sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
+	sigs.k8s.io/randfill v1.0.0 // indirect
+	sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
+	sigs.k8s.io/yaml v1.6.0 // indirect
+)
+
+// NOTE: For local development, uncomment the replace directive below.
+// For Docker builds, keep it commented out to use the published v1.2.1 release.
+// replace sigs.k8s.io/gateway-api-inference-extension => ../../../gaie_latest/gateway-api-inference-extension
--- a/deploy/inference-gateway/epp/go.sum
+++ b/deploy/inference-gateway/epp/go.sum
--- a/deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
+++ b/deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
--- a/deploy/inference-gateway/helm/dynamo-gaie/epp-config-dynamo.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/epp-config-dynamo.yaml
@@ -13,6 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Dynamo EPP Configuration
+#
+# The KV scorer sets routing headers that the Lua filter at the gateway
+# reads to inject nvext into the request body:
+#   - x-worker-instance-id: Selected worker ID
+#   - x-prefiller-host-port: Prefill worker (disaggregated mode)
+#   - x-dynamo-routing-mode: "aggregated" or "disaggregated"
+
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
@@ -22,14 +30,15 @@ plugins:
  # Picker: chooses the final endpoint after scoring
  - name: picker
    type: max-score-picker
-  - name: dyn-pre
-    type: dynamo-inject-workerid
-    parameters: {}
+
+  # Dynamo KV-aware Scorer: calls Dynamo router FFI for worker selection
+  # Implements Scorer, PreRequest, and ResponseComplete:
+  # - Score: Selects workers based on KV cache, sets routing headers
+  # - PreRequest: Registers request with router bookkeeping
+  # - ResponseComplete: Frees router bookkeeping when response completes
  - name: dyn-kv
    type: kv-aware-scorer
-    parameters:
-      frontendURL: http://127.0.0.1:8000/v1/chat/completions
-      timeoutMS: 10000
+
 schedulingProfiles:
  - name: default
    plugins:

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role-binding.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role-binding.yaml
@@ -22,4 +22,5 @@ subjects:
  namespace: {{ .Release.Namespace }}
 roleRef:
  kind: ClusterRole
-  name: pod-read
\ No newline at end of file
+  name: pod-read
+  apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
@@ -19,10 +19,10 @@ metadata:
 rules:
 # Gateway API inference resources
 - apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
+  resources: ["inferencepools", "inferenceobjectives", "inferencemodelrewrites"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
+- apiGroups: ["inference.networking.k8s.io"]
+  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
 # Core resources for pod discovery
 - apiGroups: [""]

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 {{- /* ------------ file-scope vars (no output) ------------ */ -}}
 {{- $platformNs   := default .Release.Namespace .Values.platformNamespace -}}
 {{- $platformName := default "dynamo-platform" .Values.platformReleaseName -}}
@@ -23,10 +24,10 @@
 {{- $std          := .Values.extension.standardImage -}}
 {{- $dyn          := .Values.extension.dynamoImage -}}
 {{- $fallback     := ternary $dyn $std .Values.epp.useDynamo -}}
-{{- $eppImage     := default $fallback .Values.extension.image -}}
-
+{{- $eppImage     := default $fallback .Values.extension.image }}

---  # <-- start of actual YAML document
+---
+# Deployment for the EPP (Endpoint Picker Plugin)
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -61,26 +62,30 @@ spec:
        {{- if .Values.epp.argsOverride }}
        {{- toYaml .Values.epp.argsOverride | nindent 8 }}
        {{- else }}
-          - -poolName
+          - -pool-name
          - "{{ .Values.model.shortName }}-pool"
-          - -poolNamespace
+          - -pool-namespace
          - "{{ .Release.Namespace }}"
+          - -pool-group
+          - "inference.networking.x-k8s.io"
          - -v
          - "4"
          - --zap-encoder
          - "json"
-          - -grpcPort
+          - -grpc-port
          - "9002"
-          - -grpcHealthPort
+          - -grpc-health-port
          - "9003"
          {{- if $useDynamo }}
-          - -configFile
+          - -config-file
          - "{{ .Values.epp.configFile }}"
          {{- end }}
        {{- end }}

-        {{- if $useDynamo }}
        volumeMounts:
+          - name: hf-cache
+            mountPath: /home/nonroot/.cache
+        {{- if $useDynamo }}
          - name: epp-config
            mountPath: /etc/epp
            readOnly: true
@@ -117,11 +122,21 @@ spec:
            value: "true"
          - name: USE_STREAMING
            value: "true"
+          # HuggingFace token for downloading model config files
+          # Without this, HuggingFace rate-limits requests (429 Too Many Requests)
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: HF_TOKEN
+                optional: true
        {{- end }}
        {{- range .Values.epp.extraEnv }}
          - name: {{ .name }}
            value: {{ .value | quote }}
        {{- end }}
+          - name: RUST_LOG
+            value: "debug,dynamo_llm::kv_router=trace"

        ports:
          - containerPort: 9002
@@ -141,8 +156,10 @@ spec:
          initialDelaySeconds: 5
          periodSeconds: 10

-      {{- if $useDynamo }}
      volumes:
+        - name: hf-cache
+          emptyDir: {}
+      {{- if $useDynamo }}
        - name: epp-config
          configMap:
            name: {{ include "dynamo-gaie.fullname" . }}-epp-config

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/http-router.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/http-router.yaml
@@ -14,6 +14,8 @@
 # limitations under the License.

 {{- if .Values.httpRoute.enabled }}
+{{- /* Default gatewayNamespace to the release namespace if not specified */ -}}
+{{- $gatewayNs := default .Release.Namespace .Values.httpRoute.gatewayNamespace }}
 apiVersion: gateway.networking.k8s.io/v1
 kind: HTTPRoute
 metadata:
@@ -24,9 +26,10 @@ spec:
  - group: gateway.networking.k8s.io
    kind: Gateway
    name: {{ .Values.httpRoute.gatewayName }}
+    namespace: {{ $gatewayNs }}
  rules:
  - backendRefs:
-    - group: inference.networking.x-k8s.io
+    - group: inference.networking.k8s.io
      kind: InferencePool
      name: {{ .Values.model.shortName }}-pool
      namespace: {{ .Release.Namespace }}

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/inference-model.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/inference-model.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  name: {{ .Values.model.shortName }}-model
-  namespace: {{ .Release.Namespace }}
-spec:
-  criticality: {{ .Values.model.criticality }}
-  modelName: {{ .Values.model.identifier }}
-  poolRef:
-    group: inference.networking.x-k8s.io
-    kind: InferencePool
-    name: {{ .Values.model.shortName }}-pool
\ No newline at end of file