feat: update GAIE to release version with hints in headers (#5503)

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>

feat: update GAIE to release version with hints in headers (#5503)
Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
4810ad34 · atchernych · GitHub · b31b5b56 · b31b5b56 · 4810ad34
Unverified Commit 4810ad34 authored Jan 22, 2026 by atchernych Committed by GitHub Jan 22, 2026
20 changed files
--- a/container/Dockerfile.epp
+++ b/container/Dockerfile.epp
-#  SPDX-FileCopyrightText:  Copyright The Kubernetes Authors.
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#  #
-#  http://www.apache.org/licenses/LICENSE-2.0
-#  #
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#  Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
-
-# Dockerfile.epp - Custom Dockerfile for GAIE EPP. This is to be used with the deploy/inference-gateway/build-epp-dynamo.sh
-
-ARG DOCKER_PROXY
-ARG BUILDER_IMAGE="golang:1.24"
-ARG BASE_IMAGE="ubuntu:22.04"
-
-############################
-# Builder
-############################
-FROM ${DOCKER_PROXY}${BUILDER_IMAGE} AS builder
-
-ENV CGO_ENABLED=1
-# be explicit; helps cgo when linking libstdc++
-ENV CC=gcc
-ENV CXX=g++
-
-# C/C++ toolchain for cgo, and libstdc++ for link-time
-RUN apt-get update && apt-get install -y --no-install-recommends \
-  build-essential \
-  gcc g++ \
-  libc6-dev \
-  ca-certificates \
-  && rm -rf /var/lib/apt/lists/*
-
-ARG COMMIT_SHA=unknown
-ARG BUILD_REF
-
-WORKDIR /src
-
-# deps first (cache)
-COPY go.mod go.sum ./
-RUN go mod download
-
-# source
-COPY cmd/epp ./cmd/epp
-COPY pkg/epp ./pkg/epp
-COPY internal ./internal
-COPY api ./api
-
-# sanity (optional)
-RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/include/ || echo "Headers not found"
-RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib/ || echo "Library not found"
-
-# build
-WORKDIR /src/cmd/epp
-RUN go build \
-  -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" \
-  -o /epp
-
-############################
-# Runtime
-############################
-FROM ${DOCKER_PROXY}${BASE_IMAGE} AS runtime
-
-ARG DYNAMO_COMMIT_SHA
-ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA
-
-# Minimal runtime deps; include libstdc++ runtime for -lstdc++
-RUN apt-get update && apt-get install -y --no-install-recommends \
-  ca-certificates \
-  libstdc++6 \
-  && rm -rf /var/lib/apt/lists/* \
-  && groupadd -r nonroot && useradd -r -g nonroot -m -d /home/nonroot nonroot \
-  && mkdir -p /home/nonroot/.cache/huggingface/hub \
-  && chown -R nonroot:nonroot /home/nonroot
-
-WORKDIR /
-COPY --from=builder /epp /epp
-
-# Set HOME so ModelExpress can find the cache directory
-ENV HOME=/home/nonroot
-
-USER nonroot:nonroot
-ENTRYPOINT ["/epp"]
--- a/container/README.md
+++ b/container/README.md
@@ -199,8 +199,8 @@ The frontend image is a specialized container that includes the Dynamo component
 ```

 The build process automatically:
-1. Clones the Gateway API Inference Extension (GAIE) repository
-2. Builds the custom EPP image with Dynamo routing capabilities
+1. Builds the Dynamo static library for EPP KV-aware routing
+2. Builds the custom EPP Docker image using `make all` from `deploy/inference-gateway/epp/Makefile`
 3. Builds the frontend image with the EPP binary and Dynamo runtime components

 For more details, see [`deploy/inference-gateway/README.md`](../deploy/inference-gateway/README.md).

--- a/container/build.sh
+++ b/container/build.sh
@@ -138,10 +138,6 @@ SGLANG_CUDA_VERSION="12.9.1"
 SGLANG_CUDA_VERSION_CU13="13.0.1"
 SGLANG_RUNTIME_IMAGE_TAG_CU13="v0.5.7-cu130-runtime"

-# GAIE (Gateway API Inference Extension) configuration for frontend (required for EPP binary for frontend image)
-GAIE_REPO_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension.git"
-GAIE_VERSION="v0.5.1"
-
 PYTHON_VERSION="3.12"

 NIXL_REF=0.8.0
@@ -969,39 +965,33 @@ show_image_options
 # Handle FRONTEND target: build EPP image first
 if [[ ${TARGET^^} == "FRONTEND" ]]; then
    echo "Building FRONTEND image - requires EPP image"
-
-    # Build base dynamo image first (framework=NONE, target=dev)
    echo ""
-    echo "Building EPP image for Frontend..."
-    # Set up paths for GAIE
-    GAIE_CLONE_DIR="${BUILD_CONTEXT}/.build/external/gateway-api-inference-extension"
+    echo "Building EPP image for Frontend using Makefile..."

-    # Clone GAIE repo
-    echo ""
-    echo "Cloning GAIE repository at ${GAIE_VERSION}..."
-    $RUN_PREFIX rm -rf "${GAIE_CLONE_DIR}"
-    $RUN_PREFIX mkdir -p "$(dirname "${GAIE_CLONE_DIR}")"
-    $RUN_PREFIX git clone ${GAIE_REPO_URL} "${GAIE_CLONE_DIR}"
-    $RUN_PREFIX cd "${GAIE_CLONE_DIR}"
-    $RUN_PREFIX git checkout ${GAIE_VERSION}
-    $RUN_PREFIX cd "${BUILD_CONTEXT}"
-
-    # Build EPP image
-    echo ""
-    echo "Building EPP image..."
-    export GAIE_DIR="${GAIE_CLONE_DIR}"
-    export DYNAMO_DIR="${BUILD_CONTEXT}"
+    # EPP directory with the new self-contained build
+    EPP_DIR="${BUILD_CONTEXT}/deploy/inference-gateway/epp"

    # Set DOCKER_PROXY from ECR_HOSTNAME if available (for pulling base images through proxy)
+    # This prevents rate-limiting when building in CI across multiple PRs
+    DOCKER_PROXY_ARG=""
    if [[ -n "${ECR_HOSTNAME}" ]]; then
-        export DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
+        DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
+        DOCKER_PROXY_ARG="DOCKER_PROXY=${DOCKER_PROXY}"
        echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
    fi

-    $RUN_PREFIX bash ${DYNAMO_DIR}/deploy/inference-gateway/build-epp-dynamo.sh
-
-    # Set EPP image tag (matches what build-epp-dynamo.sh produces)
-    EPP_IMAGE_TAG="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:${GAIE_VERSION}-dirty"
+    # Build EPP image using the Makefile
+    # The Makefile handles: building Dynamo library, building Docker image, loading it locally
+    $RUN_PREFIX make -C "${EPP_DIR}" all DYNAMO_DIR="${BUILD_CONTEXT}" ${DOCKER_PROXY_ARG}
+
+    # Compute EPP image tag (must match Makefile's IMAGE_TAG)
+    # IMAGE_TAG = $(IMAGE_REPO):$(GIT_TAG)
+    # IMAGE_REPO = $(DOCKER_SERVER)/$(IMAGE_NAME)
+    # Image lives in local cache only, not pushed to any registry
+    EPP_DOCKER_SERVER="dynamo"
+    EPP_IMAGE_NAME="dynamo-epp"
+    EPP_GIT_TAG=$(git describe --tags --dirty --always 2>/dev/null || echo "dev")
+    EPP_IMAGE_TAG="${EPP_DOCKER_SERVER}/${EPP_IMAGE_NAME}:${EPP_GIT_TAG}"

    echo "Successfully built EPP image: ${EPP_IMAGE_TAG}"


--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
 ## Inference Gateway Setup with Dynamo

-When integrating Dynamo with the Inference Gateway you could either use the default EPP image provided by the extension or use the custom Dynamo image.
+When integrating Dynamo with the Inference Gateway it is recommended to use the custom Dynamo EPP image.

-1. When using the Dynamo custom EPP image you will take advantage of the Dynamo router when EPP chooses the best worker to route the request to. This setup uses a custom Dynamo plugin `dyn-kv` to pick the best worker. In this case the Dynamo routing logic is moved upstream. We recommend this approach.
+1. **Dynamo EPP (Recommended):** The custom Dynamo EPP image integrates the Dynamo router directly into the gateway's endpoint picker. Using the `dyn-kv` plugin, it selects the optimal worker based on KV cache state and tokenized prompt before routing the request. The integration moves intelligent routing upstream to the gateway layer.

-2. When using the GAIE-provided image for the EPP, the Dynamo deployment is treated as a black box and the EPP would route round-robin. In this case GAIE just fans out the traffic, and the smarts only remain within the Dynamo graph. Use this if you have one Dynamo graph and do not want to obtain the Dynamo EPP image. This is a "backup" approach.
+2. **Standard EPP (Fallback):** You can use the default GAIE EPP image, which treats the Dynamo deployment as a black box and routes requests round-robin. Routing intelligence remains within the Dynamo graph itself. Use this approach if you have a single Dynamo graph and don't need the custom EPP image.
+
+EPP’s default KV routing is not token-aware because the prompt is not tokenized. The Dynamo plugin, by contrast, uses a token-aware KV algorithm: it employs the Dynamo router, which implements KV routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).

 The setup provided here uses the Dynamo custom EPP by default. Set `epp.useDynamo=false` in your deployment to pick the approach 2.

-EPP’s default kv-routing approach is not token-aware because the prompt is hashed without tokenization. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
+Dynamo Integration with the Inference Gateway supports Aggregated and Disaggregated Serving.
+If you want to use LoRA, deploy Dynamo without the Inference Gateway, or use the black-box (Standard EPP) approach with the Inference Gateway.

 Currently, these setups are only supported with the kGateway based Inference Gateway.

@@ -16,7 +19,19 @@ Currently, these setups are only supported with the kGateway based Inference Gat

 - [Prerequisites](#prerequisites)
 - [Installation Steps](#installation-steps)
- [Usage](#6-usage)
+  - [1. Install Dynamo Platform](#1-install-dynamo-platform)
+  - [2. Deploy Inference Gateway](#2-deploy-inference-gateway)
+  - [3. Deploy Your Model](#3-deploy-your-model)
+  - [4. Build EPP image](#4-build-epp-image)
+  - [5. Install Dynamo GAIE helm chart](#5-install-dynamo-gaie-helm-chart)
+  - [6. Verify Installation](#6-verify-installation)
+  - [7. Usage](#7-usage)
+  - [8. Deleting the installation](#8-deleting-the-installation)
+- [Gateway API Inference Extension Integration](#gateway-api-inference-extension-integration)
+  - [v1.2.1 API Changes](#v121-api-changes)
+  - [Building for v1.2.1](#building-for-v121)
+  - [Header-Only Routing for v1.2.1](#header-only-routing-for-v121)
+

 ## Prerequisites

@@ -34,19 +49,22 @@ Currently, these setups are only supported with the kGateway based Inference Gat
 First, deploy an inference gateway service. In this example, we'll install `kgateway` based gateway implementation.

 ```bash
-./install_gaie_crd_kgateway.sh
+cd deploy/inference-gateway
+./scripts/install_gaie_crd_kgateway.sh
 ```
+**Note**: The manifest at `config/manifests/gateway/kgateway/gateway.yaml` uses `gatewayClassName: agentgateway`, but kGateway's helm chart creates a GatewayClass named `kgateway`. The patch command in the script fixes this mismatch.

-Verify installation:
+#### f. Verify the Gateway is running

 ```bash
-kubectl get gateway inference-gateway -n my-model
+kubectl get gateway inference-gateway

 # Sample output
 # NAME                CLASS      ADDRESS   PROGRAMMED   AGE
-# inference-gateway   kgateway   x.x.x.x   True         1m
+# inference-gateway   kgateway             True         1m
 ```

+
 ### 3. Deploy Your Model ###

 Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace.
@@ -54,7 +72,8 @@ Follow the steps in [model deployment](../../examples/backends/vllm/deploy/READM
 Sample commands to deploy model:

 ```bash
-cd <dynamo-source-root>/examples/backends/vllm/deploy
+cd <dynamo-source-root>
+cd examples/backends/vllm/deploy
 kubectl apply -f agg.yaml -n my-model
 ```

@@ -83,14 +102,42 @@ Create a model configuration file similar to the vllm_agg_qwen.yaml for your mod
 This file demonstrates the values needed for the Vllm Agg setup in [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml)
 Take a note of the model's block size provided in the model card.

-### 4. Install Dynamo GAIE helm chart ###
+### 4. Build EPP image
+
+You can either use the provided Dynamo FrontEnd image as the EPP image, or build your own custom Dynamo EPP image by following the steps below.
+
+```bash
+# export env vars
+export DOCKER_SERVER=ghcr.io/nvidia/dynamo	# Container registry
+export IMAGE_TAG=YOUR-TAG # Or auto from git tag
+cd deploy/inference-gateway/epp
+make all # Do everything in one command
+# or make all-push to also push
+
+
+# Or step-by-step
+make dynamo-lib # Build Dynamo library and copy to project
+make image-load # Build Docker image and load locally
+make image-push # Build and push to registry
+make info # Check image tag
+```
+
+#### All-in-one Targets
+
+| Target | Description |
+|--------|-------------|
+| `make dynamo-lib` | Build Dynamo static library and copy to project |
+| `make all` | Build Dynamo lib + Docker image + load locally |
+| `make all-push` | Build Dynamo lib + Docker image + push to registry |
+
+### 5. Install Dynamo GAIE helm chart ###

 The Inference Gateway is configured through the `inference-gateway-resources.yaml` file.

 Deploy the Inference Gateway resources to your Kubernetes cluster by running the command below.

 ```bash
-cd deploy/inference-gateway
+cd deploy/inference-gateway/

 # Export the Dynamo image you have used when deploying your model in Step 3.
 export DYNAMO_IMAGE=<the-dynamo-image-you-have-used-when-deploying-the-model>
@@ -122,7 +169,7 @@ You can configure the plugin by setting environment vars in your [values-dynamo-

 - Overwrite the `DYN_NAMESPACE` env var if needed to match your model's dynamo namespace.
 - Set `DYNAMO_BUSY_THRESHOLD` to configure the upper bound on how “full” a worker can be (often derived from kv_active_blocks or other load metrics) before the router skips it. If the selected worker exceeds this value, routing falls back to the next best candidate. By default the value is negative meaning this is not enabled.
- Set `DYNAMO_ROUTER_REPLICA_SYNC=true` to enable a background watcher to keep multiple router instances in sync (important if you run more than one KV router per component).
+- Set `DYNAMO_ENFORCE_DISAGG=true` if you want to enforce that every request is served in the disaggregated manner. By default it is false, meaning that if the prefill worker is not available, the request will be served in the aggregated manner.
 - By default the Dynamo plugin uses KV routing. You can expose `DYNAMO_USE_KV_ROUTING=false`  in your [values-dynamo-epp.yaml] if you prefer to route in the round-robin fashion.
 - If using kv-routing:
  - Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) to match your model's block size.The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
@@ -132,52 +179,25 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
  - See the [KV cache routing design](../../docs/router/kv_cache_routing.md) for details.


-
-Dynamo provides a custom routing plugin `pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go` to perform efficient kv routing.
-The Dynamo router is built as a static library, the EPP router will call to provide fast inference.
-You can either use the special FrontEnd image for the EPP_IMAGE in the Helm deployment command and proceed to the step 2 or you can build the image yourself following the steps below.
-
-##### 1. Build the custom EPP image #####
-
-If you choose to build your own image, use the `container/build.sh` script with the `--target frontend` option:
-
-```bash
-./container/build.sh --framework none --target frontend
-```
-
-This command automatically:
- Clones the Gateway API Inference Extension (GAIE) repository at the correct version
- Builds the Dynamo Router static library
- Applies the necessary patches to the EPP codebase
- Builds the custom EPP image with Dynamo KV routing support
- Builds the frontend image with the EPP binary and Dynamo runtime components
-
-Re-tag the freshly built image and push it to your registry:
-
-```bash
-docker images
-docker tag <your-new-id> <your-image-tag>
-docker push <your-image-tag>
-```
-
 **Note**
-You can also use the standard EPP image`us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.4.0`. For the basic black box integration run:
+You can also use the standard EPP image, i.e. `us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v1.2.1`, for the basic black box integration.

 ```bash
 cd deploy/inference-gateway
+helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml
+
 # Optionally export the standard EPP image if you do not want to use the default we suggest.
 export EPP_IMAGE=us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v0.4.0
-helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false
+helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false --set-string extension.image=$EPP_IMAGE
 # Optionally overwrite the image --set-string extension.image=$EPP_IMAGE
 ```

-### 5. Verify Installation ###
+### 6. Verify Installation ###

 Check that all resources are properly deployed:

 ```bash
 kubectl get inferencepool
-kubectl get inferencemodel
 kubectl get httproute
 kubectl get service
 kubectl get gateway
@@ -190,16 +210,12 @@ Sample output:
 NAME        AGE
 qwen-pool   33m

-# kubectl get inferencemodel
-NAME         MODEL NAME        INFERENCE POOL   CRITICALITY   AGE
-qwen-model   Qwen/Qwen3-0.6B   qwen-pool        Critical      33m
-
 # kubectl get httproute
 NAME        HOSTNAMES   AGE
 qwen-route               33m
 ```

-### 6. Usage ###
+### 7. Usage ###

 The Inference Gateway provides HTTP endpoints for model inference.

@@ -310,11 +326,56 @@ Sample inference output:
 }
 ```

-### 7. Deleting the installation ###
+### 8. Deleting the installation ###

 If you need to uninstall run:

 ```bash
 kubectl delete dynamoGraphDeployment vllm-agg
 helm uninstall dynamo-gaie -n my-model
+
+# To uninstall GAIE
+# 1. Delete the inference-gateway
+kubectl delete gateway inference-gateway --ignore-not-found
+
+# 2. Uninstall kgateway helm releases
+helm uninstall kgateway -n kgateway-system
+helm uninstall kgateway-crds -n kgateway-system
+
+# 3. Delete the kgateway-system namespace (optional, cleans up everything in it)
+helm uninstall kgateway --namespace kgateway-system
+kubectl delete namespace kgateway-system --ignore-not-found
+
+# 4. Delete the Inference Extension CRDs
+IGW_LATEST_RELEASE=v1.2.1
+kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${IGW_LATEST_RELEASE}/manifests.yaml --ignore-not-found
+
+# 5. Delete the Gateway API CRDs
+GATEWAY_API_VERSION=v1.4.1
+kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml --ignore-not-found
 ```
+
+## Gateway API Inference Extension Integration
+
+This section documents the updated plugin implementation for Gateway API Inference Extension **v1.2.1**.
+
+### v1.2.1 API Changes
+
+
+### Building for v1.2.1
+
+The plugin code for v1.2.1 is in:
+- `pkg/plugins/dynamo_kv_scorer/plugin.go`
+
+
+### Header-Only Routing for v1.2.1
+
+In v1.2.1, the EPP uses a **header-only approach** for communicating routing decisions.
+The plugins set HTTP headers that are forwarded to the backend workers.
+
+#### Headers Set by Dynamo Plugins
+
+| Header | Description | Set By |
+|--------|-------------|--------|
+| `x-worker-instance-id` | Primary worker ID (decode worker in disagg mode) | kv-aware-scorer |
+| `x-prefill-instance-id` | Prefill worker ID (disaggregated mode only) | kv-aware-scorer |
--- a/deploy/inference-gateway/build-epp-dynamo.sh
+++ b/deploy/inference-gateway/build-epp-dynamo.sh
-#!/usr/bin/env bash
-
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e  # Exit on any error
-
-# Configuration - Set these environment variables before running
-if [[ -z "${DYNAMO_DIR}" ]]; then
-    echo "DYNAMO_DIR environment variable must be set"
-    echo "   Example: export DYNAMO_DIR=/path/to/dynamo"
-    exit 1
-fi
-
-if [[ -z "${GAIE_DIR}" ]]; then
-    echo "GAIE_DIR environment variable must be set"
-    echo "   Example: export GAIE_DIR=/path/to/gateway-api-inference-extension"
-    exit 1
-fi
-DYNAMO_LIB_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib"
-DYNAMO_INCLUDE_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/include"
-
-echo "Building Dynamo KV Router C Library..."
-
-# Step 1: Build the static library
-echo "Building static library..."
-cd "${DYNAMO_DIR}"
-cargo build --release -p libdynamo_llm
-
-# Step 2: Generate header file (with fallback)
-echo "Generating C header..."
-HEADER_OUTPUT="${DYNAMO_DIR}/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h"
-
-if ! cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm --output "${HEADER_OUTPUT}"; then
-    echo "cbindgen failed, using fallback header..."
-    cp "${DYNAMO_DIR}/lib/bindings/c/src/fallback_header.h" "${HEADER_OUTPUT}"
-fi
-
-# Step 3: Ensure directories exist
-echo "Preparing directories..."
-mkdir -p "${DYNAMO_LIB_DIR}"
-mkdir -p "${DYNAMO_INCLUDE_DIR}"
-
-# Step 4: Copy files to GAIE project
-echo "Copying files to the GAIE project..."
-cp "${HEADER_OUTPUT}" "${DYNAMO_INCLUDE_DIR}/"
-cp "${DYNAMO_DIR}/target/release/libdynamo_llm_capi.a" "${DYNAMO_LIB_DIR}/"
-cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${GAIE_DIR}/Dockerfile.dynamo"
-
-# Verify files were copied
-if [[ ! -f "${DYNAMO_INCLUDE_DIR}/llm_engine.h" ]]; then
-    echo "Header file copy failed!"
-    exit 1
-fi
-
-if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
-    echo "Library file copy failed!"
-    exit 1
-fi
-
-if [[ ! -f "${GAIE_DIR}/Dockerfile.dynamo" ]]; then
-    echo "Docker.dynamo file copy failed!"
-    exit 1
-fi
-
-echo "Files copied successfully:"
-echo "   Header: ${DYNAMO_INCLUDE_DIR}/llm_engine.h"
-echo "   Library: ${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a"
-echo "   Docker: ${GAIE_DIR}/Dockerfile.epp"
-
-# Step 5: Apply Dynamo patch (if it exists)
-echo "Applying Dynamo patch..."
-cd "${GAIE_DIR}"
-
-PATCH_FILE="${DYNAMO_DIR}/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch"
-if [[ -f "${PATCH_FILE}" ]]; then
-    if git apply --check "${PATCH_FILE}" 2>/dev/null; then
-        git apply "${PATCH_FILE}"
-        echo "Patch applied successfully"
-    else
-        echo "Patch doesn't apply cleanly - may already be applied or need manual resolution"
-    fi
-else
-    echo "No patch file found at ${PATCH_FILE}"
-fi
-
-# Step 6: Build the EPP image
-echo "Building the custom EPP image for GAIE..."
-
-# Build make args - pass DOCKER_PROXY if set (e.g., from ECR_HOSTNAME)
-MAKE_ARGS=""
-if [[ -n "${DOCKER_PROXY}" ]]; then
-    echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
-    MAKE_ARGS+="DOCKER_PROXY=${DOCKER_PROXY} "
-fi
-
-make ${MAKE_ARGS} dynamo-image-local-load
-
-echo "EPP image with Dynamo KV routing built"
--- a/deploy/inference-gateway/epp-patches/v0.5.1-1/epp-v0.5.1-dyn1.patch
+++ b/deploy/inference-gateway/epp-patches/v0.5.1-1/epp-v0.5.1-dyn1.patch
-diff --git a/cmd/epp/main.go b/cmd/epp/main.go
-index b5e0617..8592735 100644
--- a/cmd/epp/main.go
-+++ b/cmd/epp/main.go
-@@ -22,6 +22,11 @@ import (
- 	ctrl "sigs.k8s.io/controller-runtime"
- 
- 	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
-+	eppplugins "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+
-+	// Dynamo plugins
-+	dynprereq "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid"
-+	dynscorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
- )
- 
- func main() {
-@@ -30,6 +35,9 @@ func main() {
- 	// For adding out-of-tree plugins to the plugins registry, use the following:
- 	// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
- 
-+	eppplugins.Register("dynamo-inject-workerid", dynprereq.InjectWorkerIDPreRequestFactory)
-+	eppplugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
-+
- 	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
- 		os.Exit(1)
- 	}
-diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
-index 32fffc0..1aa1b85 100644
--- a/pkg/bbr/handlers/request.go
-+++ b/pkg/bbr/handlers/request.go
-@@ -18,8 +18,10 @@ package handlers
- 
- import (
- 	"context"
-+	"encoding/base64"
- 	"encoding/json"
- 	"fmt"
-+	"strings"
- 
- 	basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
- 	eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
-@@ -31,11 +33,49 @@ import (
- 
- const modelHeader = "X-Gateway-Model-Name"
- 
-+// Dynamo-related
-+const (
-+	workerIDHeader   = "x-worker-instance-id"
-+	injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
-+	tokenDataHeader  = "x-epp-inject-nvext-token-data"
-+)
-+
- // HandleRequestBody handles request bodies.
- func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]*eppb.ProcessingResponse, error) {
- 	logger := log.FromContext(ctx)
- 	var ret []*eppb.ProcessingResponse
- 
-+	// If we captured a worker id hint in the headers phase, inject it into body JSON:
-+	// nvext.backend_instance_id = <workerID>
-+	if wid := strings.TrimSpace(s.workerIDHint); wid != "" {
-+		// ensure nvext is a map[string]any
-+		if nv, ok := data["nvext"]; !ok || nv == nil {
-+			data["nvext"] = map[string]any{"backend_instance_id": wid}
-+		} else if m, ok := nv.(map[string]any); ok {
-+			m["backend_instance_id"] = wid
-+		} else {
-+			// if nvext was some other type, replace with a clean map
-+			data["nvext"] = map[string]any{"backend_instance_id": wid}
-+		}
-+	}
-+
-+	// If we captured token_data in headers, decode and inject as nvext.token_data
-+	if td := strings.TrimSpace(s.tokenDataHint); td != "" {
-+		// header value is base64(JSON array)
-+		if raw, err := base64.StdEncoding.DecodeString(td); err == nil {
-+			var arr []int64
-+			if err := json.Unmarshal(raw, &arr); err == nil && len(arr) > 0 {
-+				// ensure nvext map exists
-+				nv, ok := data["nvext"].(map[string]any)
-+				if !ok || nv == nil {
-+					nv = map[string]any{}
-+					data["nvext"] = nv
-+				}
-+				nv["token_data"] = arr
-+			}
-+		}
-+	}
-+
- 	requestBodyBytes, err := json.Marshal(data)
- 	if err != nil {
- 		return nil, err
-@@ -46,6 +86,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 		metrics.RecordModelNotInBodyCounter()
- 		logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter")
- 		if s.streaming {
-+			// still stream the possibly mutated body
- 			ret = append(ret, &eppb.ProcessingResponse{
- 				Response: &eppb.ProcessingResponse_RequestHeaders{
- 					RequestHeaders: &eppb.HeadersResponse{},
-@@ -53,14 +94,24 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 			})
- 			ret = addStreamedBodyResponse(ret, requestBodyBytes)
- 			return ret, nil
-		} else {
-			ret = append(ret, &eppb.ProcessingResponse{
-+		}
-+
-+		// non-streaming: return a body response with the (possibly) mutated body
-+		return []*eppb.ProcessingResponse{
-+			{
- 				Response: &eppb.ProcessingResponse_RequestBody{
-					RequestBody: &eppb.BodyResponse{},
-+					RequestBody: &eppb.BodyResponse{
-+						Response: &eppb.CommonResponse{
-+							BodyMutation: &eppb.BodyMutation{
-+								Mutation: &eppb.BodyMutation_Body{
-+									Body: requestBodyBytes,
-+								},
-+							},
-+						},
-+					},
- 				},
-			})
-		}
-		return ret, nil
-+			},
-+		}, nil
- 	}
- 
- 	modelStr, ok := modelVal.(string)
-@@ -73,6 +124,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 	metrics.RecordSuccessCounter()
- 
- 	if s.streaming {
-+		// set the model header, then stream the (possibly) mutated body
- 		ret = append(ret, &eppb.ProcessingResponse{
- 			Response: &eppb.ProcessingResponse_RequestHeaders{
- 				RequestHeaders: &eppb.HeadersResponse{
-@@ -86,16 +138,42 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 										RawValue: []byte(modelStr),
- 									},
- 								},
-+								// also keep the worker id header if we have one
-+								func() *basepb.HeaderValueOption {
-+									if strings.TrimSpace(s.workerIDHint) == "" {
-+										return nil
-+									}
-+									return &basepb.HeaderValueOption{
-+										Header: &basepb.HeaderValue{
-+											Key:      workerIDHeader,
-+											RawValue: []byte(s.workerIDHint),
-+										},
-+									}
-+								}(),
- 							},
- 						},
- 					},
- 				},
- 			},
- 		})
-+
-+		// prune nil entries if worker id not present
-+		hm := ret[len(ret)-1].GetRequestHeaders().GetResponse().GetHeaderMutation()
-+		if hm != nil && hm.SetHeaders != nil {
-+			out := hm.SetHeaders[:0]
-+			for _, h := range hm.SetHeaders {
-+				if h != nil {
-+					out = append(out, h)
-+				}
-+			}
-+			hm.SetHeaders = out
-+		}
-+
- 		ret = addStreamedBodyResponse(ret, requestBodyBytes)
- 		return ret, nil
- 	}
- 
-+	// Non-streaming: set model header and replace the body with our mutated JSON
- 	return []*eppb.ProcessingResponse{
- 		{
- 			Response: &eppb.ProcessingResponse_RequestBody{
-@@ -111,6 +189,22 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 										RawValue: []byte(modelStr),
- 									},
- 								},
-+								func() *basepb.HeaderValueOption {
-+									if strings.TrimSpace(s.workerIDHint) == "" {
-+										return nil
-+									}
-+									return &basepb.HeaderValueOption{
-+										Header: &basepb.HeaderValue{
-+											Key:      workerIDHeader,
-+											RawValue: []byte(s.workerIDHint),
-+										},
-+									}
-+								}(),
-+							},
-+						},
-+						BodyMutation: &eppb.BodyMutation{
-+							Mutation: &eppb.BodyMutation_Body{
-+								Body: requestBodyBytes,
- 							},
- 						},
- 					},
-@@ -141,6 +235,32 @@ func addStreamedBodyResponse(responses []*eppb.ProcessingResponse, requestBodyBy
- 
- // HandleRequestHeaders handles request headers.
- func (s *Server) HandleRequestHeaders(headers *eppb.HttpHeaders) ([]*eppb.ProcessingResponse, error) {
-+	// reset per-request
-+	s.workerIDHint = ""
-+	s.tokenDataHint = ""
-+
-+	if m := headers.GetHeaders(); m != nil {
-+		for _, h := range m.GetHeaders() {
-+			k := strings.ToLower(h.GetKey())
-+
-+			switch k {
-+			case injectHintHeader, workerIDHeader:
-+				if rv := h.GetRawValue(); len(rv) > 0 {
-+					s.workerIDHint = strings.TrimSpace(string(rv))
-+				} else {
-+					s.workerIDHint = strings.TrimSpace(h.GetValue())
-+				}
-+			case tokenDataHeader:
-+				if rv := h.GetRawValue(); len(rv) > 0 {
-+					s.tokenDataHint = strings.TrimSpace(string(rv))
-+				} else {
-+					s.tokenDataHint = strings.TrimSpace(h.GetValue())
-+				}
-+			}
-+		}
-+	}
-+
-+	// No header mutations needed here; body phase will do the JSON injection.
- 	return []*eppb.ProcessingResponse{
- 		{
- 			Response: &eppb.ProcessingResponse_RequestHeaders{
-diff --git a/pkg/bbr/handlers/server.go b/pkg/bbr/handlers/server.go
-index a580380..eb2893f 100644
--- a/pkg/bbr/handlers/server.go
-+++ b/pkg/bbr/handlers/server.go
-@@ -38,7 +38,9 @@ func NewServer(streaming bool) *Server {
- // Server implements the Envoy external processing server.
- // https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
- type Server struct {
-	streaming bool
-+	streaming     bool
-+	workerIDHint  string
-+	tokenDataHint string
- }
- 
- func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
-diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
-new file mode 100644
-index 0000000..b6708fa
--- /dev/null
-+++ b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
-@@ -0,0 +1,69 @@
-+package dynamo_inject_workerid
-+
-+import (
-+	"context"
-+	"encoding/json"
-+	"strings"
-+
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+)
-+
-+const (
-+	typeString       = "dynamo-inject-workerid"
-+	pluginName       = "dynamo-inject-workerid"
-+	WorkerIDHeader   = "x-worker-instance-id"
-+	injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
-+	TokenDataHeader  = "x-epp-inject-nvext-token-data"
-+)
-+
-+var _ plugins.Plugin = (*InjectWorkerIDPreRequest)(nil)
-+var _ rc.PreRequest = (*InjectWorkerIDPreRequest)(nil)
-+
-+type InjectWorkerIDPreRequest struct {
-+	typedName plugins.TypedName
-+}
-+
-+func NewInjectWorkerIDPreRequest() *InjectWorkerIDPreRequest {
-+	return &InjectWorkerIDPreRequest{
-+		typedName: plugins.TypedName{Type: typeString, Name: pluginName},
-+	}
-+}
-+
-+func (p *InjectWorkerIDPreRequest) WithName(name string) *InjectWorkerIDPreRequest {
-+	p.typedName.Name = name
-+	return p
-+}
-+
-+func InjectWorkerIDPreRequestFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	return NewInjectWorkerIDPreRequest().WithName(name), nil
-+}
-+
-+func (p *InjectWorkerIDPreRequest) TypedName() plugins.TypedName { return p.typedName }
-+
-+func (p *InjectWorkerIDPreRequest) PreRequest(
-+	_ context.Context,
-+	req *schedtypes.LLMRequest,
-+	_ *schedtypes.SchedulingResult,
-+	_ int,
-+) {
-+	if req == nil {
-+		return
-+	}
-+	if req.Headers == nil {
-+		req.Headers = map[string]string{}
-+	}
-+	wid := strings.TrimSpace(req.Headers[WorkerIDHeader])
-+	if wid == "" {
-+		return
-+	}
-+	req.Headers[WorkerIDHeader] = wid
-+	req.Headers[injectHintHeader] = wid
-+
-+	// Pass through token-data header if scorer set it
-+	if td := strings.TrimSpace(req.Headers[TokenDataHeader]); td != "" {
-+		req.Headers[TokenDataHeader] = td
-+	}
-+
-+}
-diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
-new file mode 100644
-index 0000000..2d92be0
--- /dev/null
-+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
-@@ -0,0 +1,24 @@
-+# This is an example for configuring the EPP to use the dynamo token-aware kv router for scoring the pods
-+apiVersion: inference.networking.x-k8s.io/v1alpha1
-+kind: EndpointPickerConfig
-+plugins:
-+  # Required: tells EPP which profile to use (even if you only have one)
-+  - type: single-profile-handler
-+
-+  # Picker: chooses the final endpoint after scoring
-+  - name: picker
-+    type: max-score-picker
-+  - name: dyn-pre
-+    type: dynamo-inject-workerid
-+    parameters: {}
-+  - name: dyn-kv
-+    type: kv-aware-scorer
-+    parameters:
-+      frontendURL: http://127.0.0.1:8000/v1/chat/completions
-+      timeoutMS: 10000
-+schedulingProfiles:
-+  - name: default
-+    plugins:
-+      - pluginRef: dyn-kv
-+        weight: 1
-+      - pluginRef: picker
-diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
-new file mode 100644
-index 0000000..50eb5f6
--- /dev/null
-+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
-@@ -0,0 +1,431 @@
-+package dynamo_kv_scorer
-+
-+import (
-+	"bufio"
-+	"bytes"
-+	"context"
-+	"encoding/base64"
-+	"encoding/json"
-+	"fmt"
-+	"io"
-+	"net/http"
-+	"strings"
-+	"time"
-+
-+	log "sigs.k8s.io/controller-runtime/pkg/log"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-+)
-+
-+const (
-+	PluginName               = "dynamo-kv-scorer"
-+	KVAwareScorerType        = "kv-aware-scorer"
-+	StateKeyWorkerInstanceID = schedtypes.StateKey("dynamo/worker-instance-id")
-+	WorkerIDHeader           = "x-worker-instance-id"
-+	TokenDataHeader          = "x-epp-inject-nvext-token-data"
-+)
-+
-+type params struct {
-+	FrontendURL string `json:"frontendURL"`
-+	TimeoutMS   int    `json:"timeoutMS"`
-+}
-+
-+// tiny wrapper so we can store a string in CycleState
-+type stateString string
-+
-+func (s stateString) Clone() schedtypes.StateData { return s }
-+
-+type KVAwareScorer struct {
-+	typedName plugins.TypedName
-+	feURL     string
-+	feTimeout time.Duration
-+}
-+
-+// compile-time assertions
-+var _ plugins.Plugin = (*KVAwareScorer)(nil)
-+var _ framework.Scorer = (*KVAwareScorer)(nil)
-+
-+func NewKVAwareScorer() *KVAwareScorer {
-+	return &KVAwareScorer{
-+		typedName: plugins.TypedName{Type: KVAwareScorerType, Name: PluginName},
-+		feURL:     "http://127.0.0.1:8000/v1/chat/completions",
-+		feTimeout: 10 * time.Second,
-+	}
-+}
-+
-+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer { k.typedName.Name = name; return k }
-+func (k *KVAwareScorer) WithFrontend(url string, timeout time.Duration) *KVAwareScorer {
-+	if url != "" {
-+		k.feURL = url
-+	}
-+	if timeout > 0 {
-+		k.feTimeout = timeout
-+	}
-+	return k
-+}
-+
-+func KVAwareScorerFactory(name string, raw json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	p := params{}
-+	_ = json.Unmarshal(raw, &p)
-+	timeout := time.Duration(p.TimeoutMS) * time.Millisecond
-+	if timeout <= 0 {
-+		timeout = 10 * time.Second
-+	}
-+	return NewKVAwareScorer().WithName(name).WithFrontend(p.FrontendURL, timeout), nil
-+}
-+
-+func (k *KVAwareScorer) TypedName() plugins.TypedName { return k.typedName }
-+
-+func (k *KVAwareScorer) Score(
-+	ctx context.Context,
-+	cycle *schedtypes.CycleState,
-+	req *schedtypes.LLMRequest,
-+	pods []schedtypes.Pod,
-+) map[schedtypes.Pod]float64 {
-+	logger := log.FromContext(ctx)
-+
-+	workerID, tokenData, err := k.callFrontEndForWorker(ctx, req)
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "FrontEnd call failed; proceeding without worker id")
-+	} else if workerID != "" {
-+		cycle.Write(StateKeyWorkerInstanceID, stateString(workerID))
-+		if req.Headers == nil {
-+			req.Headers = map[string]string{}
-+		}
-+		req.Headers[WorkerIDHeader] = workerID
-+		if len(tokenData) > 0 {
-+			if req.Headers == nil {
-+				req.Headers = map[string]string{}
-+			}
-+			req.Headers[TokenDataHeader] = encodeTokenData(tokenData)
-+		}
-+	}
-+
-+	// neutral/uniform scores – only your scorer runs in the profile, so this “wins”
-+	out := make(map[schedtypes.Pod]float64, len(pods))
-+	for _, p := range pods {
-+		out[p] = 1.0
-+	}
-+	return out
-+}
-+
-+// Call the Dynamo FrontEnd and extract worker_instance_id via SSE.
-+func (k *KVAwareScorer) callFrontEndForWorker(
-+	ctx context.Context,
-+	req *schedtypes.LLMRequest,
-+) (string, []int64, error) {
-+	logger := log.FromContext(ctx)
-+
-+	feBody := buildFrontEndBodyFromLLMRequest(req)
-+	payload, err := json.Marshal(feBody)
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "Dynamo FrontEnd marshal failed")
-+		return "", nil, fmt.Errorf("marshal FrontEnd body: %w", err)
-+	}
-+
-+	reqCtx, cancel := context.WithTimeout(ctx, k.feTimeout)
-+	defer cancel()
-+
-+	httpReq, err := http.NewRequestWithContext(reqCtx, http.MethodPost, k.feURL, bytes.NewReader(payload))
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "Dynamo FrontEnd request build failed")
-+		return "", nil, fmt.Errorf("build FrontEnd request: %w", err)
-+	}
-+	httpReq.Header.Set("Content-Type", "application/json")
-+	httpReq.Header.Set("Accept", "text/event-stream")
-+
-+	client := &http.Client{Timeout: 0}
-+	resp, err := client.Do(httpReq)
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "Dynamo FrontEnd POST failed")
-+		return "", nil, fmt.Errorf("FrontEnd POST failed: %w", err)
-+	}
-+	defer resp.Body.Close()
-+
-+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
-+		errBody, _ := io.ReadAll(resp.Body)
-+		logger.V(logutil.DEFAULT).Error(nil, "Dynamo FrontEnd non-2xx response",
-+			"status_code", resp.StatusCode, "response_body", string(errBody))
-+		return "", nil, fmt.Errorf("Dynamo FrontEnd error: %d body=%s", resp.StatusCode, string(errBody))
-+	}
-+
-+	ct := strings.ToLower(resp.Header.Get("Content-Type"))
-+	if !strings.Contains(ct, "text/event-stream") {
-+		logger.V(logutil.DEFAULT).Error(nil, "Unexpected non-SSE response")
-+		return "", nil, fmt.Errorf("unexpected non-SSE response (Content-Type=%q)", resp.Header.Get("Content-Type"))
-+	}
-+
-+	// Parse SSE: expect `event: worker_instance_id`, a quoted id in a comment or data, and `data: [DONE]`
-+	reader := bufio.NewReader(resp.Body)
-+	workerID, tokenData, perr := parseSelectionFromSSE(ctx, reader)
-+	if perr != nil {
-+		return "", nil, perr
-+	}
-+	return workerID, tokenData, nil
-+}
-+
-+// Build the exact body we send to the FrontEnd, only from LLMRequest (no header merging).
-+func buildFrontEndBodyFromLLMRequest(req *schedtypes.LLMRequest) map[string]any {
-+	feBody := make(map[string]any, 8)
-+
-+	// We call /v1/chat/completions so must provide messages
-+	userText := ""
-+	if req != nil && strings.TrimSpace(req.Prompt) != "" {
-+		userText = req.Prompt
-+	}
-+	feBody["messages"] = []map[string]any{
-+		{"role": "user", "content": userText},
-+	}
-+
-+	if req != nil && strings.TrimSpace(req.TargetModel) != "" {
-+		feBody["model"] = req.TargetModel
-+	}
-+
-+	// Force SSE so we can parse worker_instance_id
-+	feBody["stream"] = true
-+
-+	feBody["max_tokens"] = 1
-+	feBody["temperature"] = 0.0
-+
-+	// Ask the Dynamo to include worker id
-+	feBody["nvext"] = map[string]any{
-+		"annotations": []string{"query_instance_id"},
-+	}
-+
-+	return feBody
-+}
-+
-+// This function scans an SSE stream for a worker_instance_id and token_data.
-+// Expected pattern:
-+//
-+//	event: worker_instance_id
-+//	: "8303679623149182543"
-+//	data: [DONE]
-+
-+// or with tokens:
-+// event: worker_instance_id\n: \"8228244551594056720\"\n\n
-+// event: token_data\n: \"[151644,872,198,151644,872,198,14990,151645,198,151645,198,151644,77091,198]\
-+// "\n\ndata: [DONE]\n\n"
-+// Also supports JSON in data lines with either top-level worker_instance_id
-+// or annotations.worker_instance_id.
-+func parseSelectionFromSSE(ctx context.Context, reader *bufio.Reader) (string, []int64, error) {
-+	logger := log.FromContext(ctx)
-+
-+	var (
-+		eventName  string
-+		dataBuf    strings.Builder // accumulates "data:" lines for one event
-+		commentBuf strings.Builder // accumulates ":" comment lines
-+		gotWID     string
-+		gotTD      []int64
-+	)
-+
-+	// collect the exact SSE bytes for debugging
-+	var rawBuf strings.Builder
-+
-+	flushEvent := func() (bool, error) {
-+		data := strings.TrimSpace(dataBuf.String())
-+		comment := strings.TrimSpace(commentBuf.String())
-+		dataBuf.Reset()
-+		commentBuf.Reset()
-+
-+		// [DONE] ends the stream
-+		if data == "[DONE]" || comment == "[DONE]" {
-+			logger.V(logutil.DEFAULT).Info("SSE stream DONE")
-+			logger.V(logutil.DEFAULT).Info("SSE raw stream", "raw", rawBuf.String())
-+			if gotWID != "" && len(gotTD) == 0 {
-+				logger.V(logutil.DEFAULT).Info("SSE DONE: worker_instance_id present, token_data missing")
-+			}
-+			return true, nil
-+		}
-+
-+		// Prefer the named event
-+		if eventName == "worker_instance_id" {
-+			candidate := data
-+			if candidate == "" {
-+				candidate = comment
-+			}
-+			if candidate != "" {
-+				// Try JSON string
-+				var s string
-+				if json.Unmarshal([]byte(candidate), &s) == nil && s != "" {
-+					logger.V(logutil.VERBOSE).Info("worker_instance_id extracted from named event", "worker_instance_id", s)
-+					gotWID = s
-+					return false, nil
-+				}
-+				// Fallback: strip quotes
-+				clean := strings.Trim(candidate, "\"")
-+				if clean != "" && clean != "[DONE]" {
-+					logger.V(logutil.DEFAULT).Info("worker_instance_id extracted (raw) from named event", "worker_instance_id", clean)
-+					gotWID = clean
-+					return false, nil
-+				}
-+			}
-+		}
-+
-+		if eventName == "token_data" {
-+			candidate := data
-+			if candidate == "" {
-+				candidate = comment
-+			}
-+			if candidate != "" {
-+				if arr := toInt64SliceJSON(candidate); len(arr) > 0 {
-+					gotTD = arr
-+					logger.V(logutil.DEFAULT).Info("token_data extracted from named event", "count", len(arr))
-+					return false, nil
-+				}
-+			}
-+		}
-+		// Generic JSON in data:
-+		if data != "" {
-+			var msg map[string]any
-+			if json.Unmarshal([]byte(data), &msg) == nil {
-+				if wid, ok := msg["worker_instance_id"].(string); ok && wid != "" {
-+					logger.V(logutil.DEFAULT).Info("worker_instance_id found in SSE payload root", "worker_instance_id", wid)
-+					gotWID = wid
-+				}
-+				if ann, ok := msg["annotations"].(map[string]any); ok {
-+					if wid, ok := ann["worker_instance_id"].(string); ok && wid != "" {
-+						logger.V(logutil.DEFAULT).Info("worker_instance_id found in SSE annotations", "worker_instance_id", wid)
-+						gotWID = wid
-+					}
-+				}
-+				if td, ok := msg["token_data"]; ok {
-+					if arr := toInt64Slice(td); len(arr) > 0 {
-+						gotTD = arr
-+						logger.V(logutil.DEFAULT).Info("token_data found in SSE payload root", "count", len(arr))
-+					}
-+				} else if nv, ok := msg["nvext"].(map[string]any); ok {
-+					if td, ok := nv["token_data"]; ok {
-+						if arr := toInt64Slice(td); len(arr) > 0 {
-+							gotTD = arr
-+							logger.V(logutil.DEFAULT).Info("token_data found in SSE nvext", "count", len(arr))
-+						}
-+					}
-+				}
-+			}
-+		}
-+		return false, nil
-+	}
-+
-+	for {
-+		line, err := reader.ReadString('\n')
-+		// capture the raw stream as-is for debugging
-+		rawBuf.WriteString(line)
-+		if err != nil {
-+			if err == io.EOF {
-+				_, _ = flushEvent()
-+				logger.V(logutil.DEFAULT).Info("SSE raw stream (EOF)", "raw", rawBuf.String())
-+				if gotWID != "" && len(gotTD) == 0 {
-+					logger.V(logutil.DEFAULT).Info("EOF: worker_instance_id present, token_data missing")
-+				}
-+				if gotWID != "" || len(gotTD) > 0 {
-+					return gotWID, gotTD, nil
-+				}
-+				logger.V(logutil.DEFAULT).Error(nil, "EOF before selection fields present")
-+				return "", nil, fmt.Errorf("selection not found in SSE stream (EOF)")
-+			}
-+			logger.V(logutil.DEFAULT).Error(err, "SSE read error")
-+			return "", nil, fmt.Errorf("sse read error: %w", err)
-+		}
-+
-+		l := strings.TrimRight(line, "\r\n")
-+		if l == "" {
-+			// End of current event.
-+			if done, _ := flushEvent(); done {
-+				if gotWID != "" && len(gotTD) == 0 {
-+					logger.V(logutil.DEFAULT).Info("SSE DONE: worker_instance_id present, token_data missing")
-+				}
-+				return gotWID, gotTD, nil
-+			}
-+			eventName = "" // reset for next event
-+			continue
-+		}
-+
-+		// Comment line
-+		if strings.HasPrefix(l, ":") {
-+			commentLine := strings.TrimSpace(l[1:])
-+			if commentBuf.Len() > 0 {
-+				commentBuf.WriteByte('\n')
-+			}
-+			commentBuf.WriteString(commentLine)
-+			continue
-+		}
-+
-+		// "field: value"
-+		if idx := strings.IndexByte(l, ':'); idx != -1 {
-+			field := l[:idx]
-+			val := strings.TrimSpace(l[idx+1:])
-+			switch field {
-+			case "event":
-+				eventName = val
-+			case "data":
-+				if dataBuf.Len() > 0 {
-+					dataBuf.WriteByte('\n')
-+				}
-+				dataBuf.WriteString(val)
-+			default:
-+				// ignore id, retry, etc.
-+			}
-+		}
-+	}
-+}
-+
-+// encodeTokenData turns []int64 into base64(JSON array) for a safe header value.
-+func encodeTokenData(tokens []int64) string {
-+	b, _ := json.Marshal(tokens)
-+	return base64.StdEncoding.EncodeToString(b)
-+}
-+
-+// Accepts interface{} from a parsed JSON map
-+func toInt64Slice(v any) []int64 {
-+	xs, ok := v.([]any)
-+	if !ok {
-+		return nil
-+	}
-+	out := make([]int64, 0, len(xs))
-+	for _, it := range xs {
-+		switch n := it.(type) {
-+		case float64:
-+			out = append(out, int64(n))
-+		case int64:
-+			out = append(out, n)
-+		case json.Number:
-+			if i, err := n.Int64(); err == nil {
-+				out = append(out, i)
-+			}
-+		}
-+	}
-+	return out
-+}
-+
-+// Accepts raw JSON (string) for events like:
-+// event: worker_instance_id\n: \"8228244551594056720\"\n\n
-+// event: token_data\n: \"[151644,872,198,151644,872,198,14990,151645,198,151645,198,151644,77091,198]\
-+// "\n\ndata: [DONE]\n\n"
-+// replaces the old toInt64SliceJSON
-+func toInt64SliceJSON(s string) []int64 {
-+	// case 1: direct JSON array
-+	var arr []int64
-+	if err := json.Unmarshal([]byte(s), &arr); err == nil && len(arr) > 0 {
-+		return arr
-+	}
-+	// case 2: s is a JSON string that itself contains a JSON array
-+	var inner string
-+	if err := json.Unmarshal([]byte(s), &inner); err == nil && inner != "" {
-+		var arr2 []int64
-+		if err := json.Unmarshal([]byte(inner), &arr2); err == nil && len(arr2) > 0 {
-+			return arr2
-+		}
-+	}
-+	// case 3: strip quotes and try once more
-+	unquoted := strings.Trim(s, "\"")
-+	if unquoted != s {
-+		var arr3 []int64
-+		if err := json.Unmarshal([]byte(unquoted), &arr3); err == nil && len(arr3) > 0 {
-+			return arr3
-+		}
-+	}
-+	return nil
-+}
--- a/deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch
+++ b/deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch
-diff --git a/Makefile b/Makefile
-index dee7e99..4679ce2 100644
--- a/Makefile
-+++ b/Makefile
-@@ -170,6 +170,48 @@ verify-all:
- 
- ##@ Build
- 
-+##@ Dynamo EPP with FFI
-+
-+# Build the Dynamo EPP image with CGO static library support
-+.PHONY: dynamo-image-local-build
-+dynamo-image-local-build: ## Build the Dynamo EPP image using Docker Buildx for local development.
-+	BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
-+	$(MAKE) dynamo-image-build PUSH=$(PUSH)
-+	$(MAKE) dynamo-image-build LOAD=$(LOAD)
-+	$(DOCKER_BUILDX_CMD) rm $$BUILDER
-+
-+.PHONY: dynamo-image-local-push
-+dynamo-image-local-push: PUSH=--push ## Build the Dynamo EPP image for local development and push it to $IMAGE_REPO.
-+dynamo-image-local-push: dynamo-image-local-build
-+
-+.PHONY: dynamo-image-local-load
-+dynamo-image-local-load: LOAD=--load ## Build the Dynamo EPP image for local development and load it in the local Docker registry.
-+dynamo-image-local-load: dynamo-image-local-build
-+
-+.PHONY: dynamo-image-build
-+dynamo-image-build: ## Build the Dynamo EPP image using Docker Buildx with CGO support.
-+	$(IMAGE_BUILD_CMD) -f Dockerfile.dynamo -t $(IMAGE_TAG) \
-+		--platform=$(PLATFORMS) \
-+		--build-arg BASE_IMAGE=ubuntu:22.04 \
-+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
-+		--build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \
-+		--build-arg BUILD_REF=${BUILD_REF} \
-+		$(PUSH) \
-+		$(LOAD) \
-+		$(IMAGE_BUILD_EXTRA_OPTS) ./
-+
-+.PHONY: dynamo-image-push
-+dynamo-image-push: PUSH=--push ## Build the Dynamo EPP image and push it to $IMAGE_REPO.
-+dynamo-image-push: dynamo-image-build
-+
-+.PHONY: dynamo-image-load
-+dynamo-image-load: LOAD=--load ## Build the Dynamo EPP image and load it in the local Docker registry.
-+dynamo-image-load: dynamo-image-build
-+
-+.PHONY: dynamo-image-kind
-+dynamo-image-kind: dynamo-image-build ## Build the Dynamo EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
-+	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
-+
- # Build the container image
- .PHONY: image-local-build
- image-local-build: ## Build the EPP image using Docker Buildx for local development.
-diff --git a/cmd/epp/main.go b/cmd/epp/main.go
-index b5e0617..8592735 100644
--- a/cmd/epp/main.go
-+++ b/cmd/epp/main.go
-@@ -22,6 +22,11 @@ import (
- 	ctrl "sigs.k8s.io/controller-runtime"
- 
- 	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
-+	eppplugins "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+
-+	// Dynamo plugins
-+	dynprereq "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid"
-+	dynscorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
- )
- 
- func main() {
-@@ -30,6 +35,9 @@ func main() {
- 	// For adding out-of-tree plugins to the plugins registry, use the following:
- 	// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
- 
-+	eppplugins.Register("dynamo-inject-workerid", dynprereq.InjectWorkerIDPreRequestFactory)
-+	eppplugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
-+
- 	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
- 		os.Exit(1)
- 	}
-diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
-index 32fffc0..1aa1b85 100644
--- a/pkg/bbr/handlers/request.go
-+++ b/pkg/bbr/handlers/request.go
-@@ -18,8 +18,10 @@ package handlers
- 
- import (
- 	"context"
-+	"encoding/base64"
- 	"encoding/json"
- 	"fmt"
-+	"strings"
- 
- 	basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
- 	eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
-@@ -31,11 +33,49 @@ import (
- 
- const modelHeader = "X-Gateway-Model-Name"
- 
-+// Dynamo-related
-+const (
-+	workerIDHeader   = "x-worker-instance-id"
-+	injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
-+	tokenDataHeader  = "x-epp-inject-nvext-token-data"
-+)
-+
- // HandleRequestBody handles request bodies.
- func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]*eppb.ProcessingResponse, error) {
- 	logger := log.FromContext(ctx)
- 	var ret []*eppb.ProcessingResponse
- 
-+	// If we captured a worker id hint in the headers phase, inject it into body JSON:
-+	// nvext.backend_instance_id = <workerID>
-+	if wid := strings.TrimSpace(s.workerIDHint); wid != "" {
-+		// ensure nvext is a map[string]any
-+		if nv, ok := data["nvext"]; !ok || nv == nil {
-+			data["nvext"] = map[string]any{"backend_instance_id": wid}
-+		} else if m, ok := nv.(map[string]any); ok {
-+			m["backend_instance_id"] = wid
-+		} else {
-+			// if nvext was some other type, replace with a clean map
-+			data["nvext"] = map[string]any{"backend_instance_id": wid}
-+		}
-+	}
-+
-+	// If we captured token_data in headers, decode and inject as nvext.token_data
-+	if td := strings.TrimSpace(s.tokenDataHint); td != "" {
-+		// header value is base64(JSON array)
-+		if raw, err := base64.StdEncoding.DecodeString(td); err == nil {
-+			var arr []int64
-+			if err := json.Unmarshal(raw, &arr); err == nil && len(arr) > 0 {
-+				// ensure nvext map exists
-+				nv, ok := data["nvext"].(map[string]any)
-+				if !ok || nv == nil {
-+					nv = map[string]any{}
-+					data["nvext"] = nv
-+				}
-+				nv["token_data"] = arr
-+			}
-+		}
-+	}
-+
- 	requestBodyBytes, err := json.Marshal(data)
- 	if err != nil {
- 		return nil, err
-@@ -46,6 +86,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 		metrics.RecordModelNotInBodyCounter()
- 		logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter")
- 		if s.streaming {
-+			// still stream the possibly mutated body
- 			ret = append(ret, &eppb.ProcessingResponse{
- 				Response: &eppb.ProcessingResponse_RequestHeaders{
- 					RequestHeaders: &eppb.HeadersResponse{},
-@@ -53,14 +94,24 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 			})
- 			ret = addStreamedBodyResponse(ret, requestBodyBytes)
- 			return ret, nil
-		} else {
-			ret = append(ret, &eppb.ProcessingResponse{
-+		}
-+
-+		// non-streaming: return a body response with the (possibly) mutated body
-+		return []*eppb.ProcessingResponse{
-+			{
- 				Response: &eppb.ProcessingResponse_RequestBody{
-					RequestBody: &eppb.BodyResponse{},
-+					RequestBody: &eppb.BodyResponse{
-+						Response: &eppb.CommonResponse{
-+							BodyMutation: &eppb.BodyMutation{
-+								Mutation: &eppb.BodyMutation_Body{
-+									Body: requestBodyBytes,
-+								},
-+							},
-+						},
-+					},
- 				},
-			})
-		}
-		return ret, nil
-+			},
-+		}, nil
- 	}
- 
- 	modelStr, ok := modelVal.(string)
-@@ -73,6 +124,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 	metrics.RecordSuccessCounter()
- 
- 	if s.streaming {
-+		// set the model header, then stream the (possibly) mutated body
- 		ret = append(ret, &eppb.ProcessingResponse{
- 			Response: &eppb.ProcessingResponse_RequestHeaders{
- 				RequestHeaders: &eppb.HeadersResponse{
-@@ -86,16 +138,42 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 										RawValue: []byte(modelStr),
- 									},
- 								},
-+								// also keep the worker id header if we have one
-+								func() *basepb.HeaderValueOption {
-+									if strings.TrimSpace(s.workerIDHint) == "" {
-+										return nil
-+									}
-+									return &basepb.HeaderValueOption{
-+										Header: &basepb.HeaderValue{
-+											Key:      workerIDHeader,
-+											RawValue: []byte(s.workerIDHint),
-+										},
-+									}
-+								}(),
- 							},
- 						},
- 					},
- 				},
- 			},
- 		})
-+
-+		// prune nil entries if worker id not present
-+		hm := ret[len(ret)-1].GetRequestHeaders().GetResponse().GetHeaderMutation()
-+		if hm != nil && hm.SetHeaders != nil {
-+			out := hm.SetHeaders[:0]
-+			for _, h := range hm.SetHeaders {
-+				if h != nil {
-+					out = append(out, h)
-+				}
-+			}
-+			hm.SetHeaders = out
-+		}
-+
- 		ret = addStreamedBodyResponse(ret, requestBodyBytes)
- 		return ret, nil
- 	}
- 
-+	// Non-streaming: set model header and replace the body with our mutated JSON
- 	return []*eppb.ProcessingResponse{
- 		{
- 			Response: &eppb.ProcessingResponse_RequestBody{
-@@ -111,6 +189,22 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
- 										RawValue: []byte(modelStr),
- 									},
- 								},
-+								func() *basepb.HeaderValueOption {
-+									if strings.TrimSpace(s.workerIDHint) == "" {
-+										return nil
-+									}
-+									return &basepb.HeaderValueOption{
-+										Header: &basepb.HeaderValue{
-+											Key:      workerIDHeader,
-+											RawValue: []byte(s.workerIDHint),
-+										},
-+									}
-+								}(),
-+							},
-+						},
-+						BodyMutation: &eppb.BodyMutation{
-+							Mutation: &eppb.BodyMutation_Body{
-+								Body: requestBodyBytes,
- 							},
- 						},
- 					},
-@@ -141,6 +235,32 @@ func addStreamedBodyResponse(responses []*eppb.ProcessingResponse, requestBodyBy
- 
- // HandleRequestHeaders handles request headers.
- func (s *Server) HandleRequestHeaders(headers *eppb.HttpHeaders) ([]*eppb.ProcessingResponse, error) {
-+	// reset per-request
-+	s.workerIDHint = ""
-+	s.tokenDataHint = ""
-+
-+	if m := headers.GetHeaders(); m != nil {
-+		for _, h := range m.GetHeaders() {
-+			k := strings.ToLower(h.GetKey())
-+
-+			switch k {
-+			case injectHintHeader, workerIDHeader:
-+				if rv := h.GetRawValue(); len(rv) > 0 {
-+					s.workerIDHint = strings.TrimSpace(string(rv))
-+				} else {
-+					s.workerIDHint = strings.TrimSpace(h.GetValue())
-+				}
-+			case tokenDataHeader:
-+				if rv := h.GetRawValue(); len(rv) > 0 {
-+					s.tokenDataHint = strings.TrimSpace(string(rv))
-+				} else {
-+					s.tokenDataHint = strings.TrimSpace(h.GetValue())
-+				}
-+			}
-+		}
-+	}
-+
-+	// No header mutations needed here; body phase will do the JSON injection.
- 	return []*eppb.ProcessingResponse{
- 		{
- 			Response: &eppb.ProcessingResponse_RequestHeaders{
-diff --git a/pkg/bbr/handlers/server.go b/pkg/bbr/handlers/server.go
-index a580380..eb2893f 100644
--- a/pkg/bbr/handlers/server.go
-+++ b/pkg/bbr/handlers/server.go
-@@ -38,7 +38,9 @@ func NewServer(streaming bool) *Server {
- // Server implements the Envoy external processing server.
- // https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
- type Server struct {
-	streaming bool
-+	streaming     bool
-+	workerIDHint  string
-+	tokenDataHint string
- }
- 
- func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
-diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
-new file mode 100644
-index 0000000..b6708fa
--- /dev/null
-+++ b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
-@@ -0,0 +1,69 @@
-+package dynamo_inject_workerid
-+
-+import (
-+	"context"
-+	"encoding/json"
-+	"strings"
-+
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+)
-+
-+const (
-+	typeString       = "dynamo-inject-workerid"
-+	pluginName       = "dynamo-inject-workerid"
-+	WorkerIDHeader   = "x-worker-instance-id"
-+	injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
-+	TokenDataHeader  = "x-epp-inject-nvext-token-data"
-+)
-+
-+var _ plugins.Plugin = (*InjectWorkerIDPreRequest)(nil)
-+var _ rc.PreRequest = (*InjectWorkerIDPreRequest)(nil)
-+
-+type InjectWorkerIDPreRequest struct {
-+	typedName plugins.TypedName
-+}
-+
-+func NewInjectWorkerIDPreRequest() *InjectWorkerIDPreRequest {
-+	return &InjectWorkerIDPreRequest{
-+		typedName: plugins.TypedName{Type: typeString, Name: pluginName},
-+	}
-+}
-+
-+func (p *InjectWorkerIDPreRequest) WithName(name string) *InjectWorkerIDPreRequest {
-+	p.typedName.Name = name
-+	return p
-+}
-+
-+func InjectWorkerIDPreRequestFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	return NewInjectWorkerIDPreRequest().WithName(name), nil
-+}
-+
-+func (p *InjectWorkerIDPreRequest) TypedName() plugins.TypedName { return p.typedName }
-+
-+// PreRequest trims the worker-id header and mirrors it into the inject-hint
-+// header. It returns early when req is nil or no worker id is present.
-+func (p *InjectWorkerIDPreRequest) PreRequest(_ context.Context, req *schedtypes.LLMRequest, _ *schedtypes.SchedulingResult, _ int) {
-+	if req == nil {
-+		return
-+	}
-+	if req.Headers == nil {
-+		// An empty map is installed even if nothing ends up being written.
-+		req.Headers = make(map[string]string)
-+	}
-+
-+	headers := req.Headers
-+	workerID := strings.TrimSpace(headers[WorkerIDHeader])
-+	if workerID != "" {
-+		// Store the trimmed id back and copy it to the hint header.
-+		headers[WorkerIDHeader] = workerID
-+		headers[injectHintHeader] = workerID
-+
-+		// Re-write the token-data header without surrounding whitespace.
-+		tokenData := strings.TrimSpace(headers[TokenDataHeader])
-+		if tokenData != "" {
-+			headers[TokenDataHeader] = tokenData
-+		}
-+	}
-+}
-diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
-new file mode 100644
-index 0000000..b689c00
--- /dev/null
-+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
-@@ -0,0 +1,21 @@
-+# This is an example for configuring the EPP to use the dynamo token-aware kv router for scoring the pods
-+apiVersion: inference.networking.x-k8s.io/v1alpha1
-+kind: EndpointPickerConfig
-+plugins:
-+  # Required: tells EPP which profile to use (even if you only have one)
-+  - type: single-profile-handler
-+
-+  # Picker: chooses the final endpoint after scoring
-+  - name: picker
-+    type: max-score-picker
-+  - name: dyn-pre
-+    type: dynamo-inject-workerid
-+    parameters: {}
-+  - name: dyn-kv
-+    type: kv-aware-scorer
-+schedulingProfiles:
-+  - name: default
-+    plugins:
-+      - pluginRef: dyn-kv
-+        weight: 1
-+      - pluginRef: picker
-diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
-new file mode 100644
-index 0000000..83a4ace
--- /dev/null
-+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
-@@ -0,0 +1,428 @@
-+package dynamo_kv_scorer
-+
-+/*
-+#cgo CPPFLAGS: -I${SRCDIR}/include
-+#cgo CXXFLAGS: -std=c++17
-+#cgo LDFLAGS: ${SRCDIR}/lib/libdynamo_llm_capi.a -lstdc++ -ldl -lpthread -lm
-+
-+#include <stdint.h>
-+#include <stddef.h>
-+#include <stdlib.h>   // for free
-+#include <stdbool.h>
-+
-+// enum underlying type is uint32_t; matches cbindgen output
-+typedef uint32_t dynamo_llm_result_t;
-+enum { DYNAMO_OK = 0, DYNAMO_ERR = 1 };
-+
-+// opaque handle forward-decl
-+struct WorkerSelectionPipeline;
-+typedef struct WorkerSelectionPipeline WorkerSelectionPipeline;
-+
-+// Prototypes (C-compatible)
-+dynamo_llm_result_t dynamo_llm_init(const char *namespace_c_str,
-+                                    const char *component_c_str,
-+                                    int64_t worker_id,
-+                                    uint32_t kv_block_size);
-+
-+dynamo_llm_result_t dynamo_llm_shutdown(void);
-+dynamo_llm_result_t dynamo_llm_load_publisher_create(void);
-+
-+dynamo_llm_result_t dynamo_kv_event_publish_stored(uint64_t event_id,
-+                                                   const uint32_t *token_ids,
-+                                                   const uintptr_t *num_block_tokens,
-+                                                   const uint64_t *block_ids,
-+                                                   size_t num_blocks,
-+                                                   const uint64_t *parent_hash,
-+                                                   uint64_t lora_id);
-+
-+dynamo_llm_result_t dynamo_kv_event_publish_removed(uint64_t event_id,
-+                                                    const uint64_t *block_ids,
-+                                                    size_t num_blocks);
-+
-+dynamo_llm_result_t dynamo_create_worker_selection_pipeline(const char *namespace_c_str,
-+                                                            const char *component_c_str,
-+                                                            const char *model_name_c_str,
-+                                                            bool use_kv_routing,
-+                                                            double busy_threshold,
-+                                                            double overlap_score_weight,
-+                                                            double router_temperature,
-+                                                            bool use_kv_events,
-+                                                            bool router_replica_sync,
-+                                                            WorkerSelectionPipeline **pipeline_out);
-+
-+dynamo_llm_result_t dynamo_destroy_worker_selection_pipeline(WorkerSelectionPipeline *pipeline);
-+
-+dynamo_llm_result_t dynamo_query_worker_selection_and_annotate(WorkerSelectionPipeline *pipeline,
-+                                                               const char *request_json_c_str,
-+                                                               int64_t *worker_instance_id_out,
-+                                                               uint32_t **token_ids_out,
-+                                                               size_t *token_count_out,
-+                                                               char **annotated_request_json_out);
-+
-+dynamo_llm_result_t dynamo_free_worker_selection_result(uint32_t *token_ids,
-+                                                        size_t token_count,
-+                                                        char *annotated_request_json);
-+*/
-+import "C"
-+
-+import (
-+	"context"
-+	"encoding/base64"
-+	"encoding/json"
-+	"fmt"
-+	"os"
-+	"strings"
-+	"sync"
-+	"unsafe"
-+
-+	log "sigs.k8s.io/controller-runtime/pkg/log"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-+)
-+
-+const (
-+	PluginName               = "dynamo-kv-scorer" // default Name in the scorer's TypedName
-+	KVAwareScorerType        = "kv-aware-scorer" // Type in the scorer's TypedName
-+	StateKeyWorkerInstanceID = schedtypes.StateKey("dynamo/worker-instance-id") // cycle-state key Score writes the worker id under
-+	WorkerIDHeader           = "x-worker-instance-id" // request header Score sets to the worker id
-+	TokenDataHeader          = "x-epp-inject-nvext-token-data" // request header Score sets to the encoded token ids
-+)
-+
-+// --------------------------- config / env ---------------------------
-+
-+var warmupOnce sync.Once
-+var warmupErr error
-+
-+type stateString string
-+type params struct {
-+}
-+
-+func (s stateString) Clone() schedtypes.StateData { return s }
-+
-+type KVAwareScorer struct {
-+	typedName plugins.TypedName
-+}
-+
-+var _ plugins.Plugin = (*KVAwareScorer)(nil)
-+var _ framework.Scorer = (*KVAwareScorer)(nil)
-+
-+func NewKVAwareScorer() *KVAwareScorer {
-+	// Defaults for both fields; WithName can replace the Name afterwards.
-+	defaults := plugins.TypedName{Type: KVAwareScorerType, Name: PluginName}
-+	return &KVAwareScorer{typedName: defaults}
-+}
-+
-+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer { k.typedName.Name = name; return k }
-+
-+// KVAwareScorerFactory builds a named KVAwareScorer and runs the one-time FFI
-+// warm-up, returning an error when that warm-up failed or panicked.
-+func KVAwareScorerFactory(name string, raw json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	var cfg params
-+	_ = json.Unmarshal(raw, &cfg) // params has no fields; the decode error is explicitly discarded
-+	scorer := NewKVAwareScorer().WithName(name)
-+	// warmupOnce lets initFFI be attempted a single time across all factory calls.
-+	warmupOnce.Do(func() {
-+		defer func() {
-+			// A panic raised during init is converted into warmupErr.
-+			if recovered := recover(); recovered != nil {
-+				warmupErr = fmt.Errorf("Dynamo configuration error: %v", recovered)
-+			}
-+		}()
-+		warmupErr = initFFI()
-+	})
-+	if warmupErr == nil {
-+		return scorer, nil
-+	}
-+	return nil, fmt.Errorf("Dynamo FFI init for the Router failed: %w", warmupErr)
-+}
-+
-+func (k *KVAwareScorer) TypedName() plugins.TypedName { return k.typedName }
-+
-+// --------------------------- FFI integration ---------------------------
-+
-+var (
-+	ffiOnce sync.Once // makes the body of initFFI run only once
-+	ffiErr  error     // result of that run, returned by every initFFI call
-+
-+	ffiNamespace          string // this group is filled in by loadDynamoConfig
-+	ffiComponent          string
-+	ffiModel              string
-+	ffiOverlapScoreWeight float64
-+	ffiRouterTemperature  float64
-+	ffiKvBlockSize        uint32
-+	ffiWorkerID           int64
-+
-+	runtimeInitialized bool // true after dynamo_llm_init returned DYNAMO_OK; cleared by cleanupDynamo
-+
-+	// Boxed pipeline handle (owned on the Rust side, opaque here)
-+	pipeline      *C.struct_WorkerSelectionPipeline
-+	pipelineMutex sync.RWMutex // taken around every read and write of pipeline
-+)
-+
-+// loadDynamoConfig fills the ffi* package variables from DYNAMO_* environment variables.
-+// It panics when DYNAMO_KV_BLOCK_SIZE is unset, not an integer, outside [16,8192] or not a power of 2.
-+func loadDynamoConfig() {
-+	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
-+	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
-+	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
-+	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
-+	ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
-+	ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)
-+	kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE")
-+	if kvBlockSizeStr == "" {
-+		panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
-+	}
-+	var tmp int64
-+	if n, err := fmt.Sscanf(kvBlockSizeStr, "%d", &tmp); err != nil || n != 1 {
-+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
-+	}
-+	if tmp < 16 || tmp > 8192 { // check the parsed int64 itself: uint32(tmp) would wrap negative or >32-bit values into range
-+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", tmp))
-+	}
-+	if (tmp & (tmp - 1)) != 0 {
-+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", tmp))
-+	}
-+	ffiKvBlockSize = uint32(tmp) // lossless now that tmp is known to lie within [16,8192]
-+	fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
-+}
-+
-+func getEnvOrDefault(key, def string) string {
-+	if v := os.Getenv(key); v != "" { // an empty value is treated the same as an unset variable
-+		return v
-+	}
-+	return def // fall back to the caller-supplied default
-+}
-+// getEnvInt64OrDefault scans key as a decimal int64; unset, empty or unparsable values give def.
-+func getEnvInt64OrDefault(key string, def int64) int64 {
-+	var parsed int64
-+	raw := os.Getenv(key)
-+	if n, err := fmt.Sscanf(raw, "%d", &parsed); raw == "" || err != nil || n != 1 {
-+		return def
-+	}
-+	return parsed
-+}
-+// getEnvFloatOrDefault scans key as a float64; unset, empty or unparsable values give def.
-+func getEnvFloatOrDefault(key string, def float64) float64 {
-+	var parsed float64
-+	raw := os.Getenv(key)
-+	if n, err := fmt.Sscanf(raw, "%f", &parsed); raw == "" || err != nil || n != 1 {
-+		return def
-+	}
-+	return parsed
-+}
-+// getEnvBoolOrDefault lower-cases key's value: true/1/yes/on give true, false/0/no/off give false, anything else gives def.
-+func getEnvBoolOrDefault(key string, def bool) bool {
-+	switch strings.ToLower(os.Getenv(key)) {
-+	case "true", "1", "yes", "on":
-+		return true
-+	case "false", "0", "no", "off":
-+		return false
-+	default:
-+		return def
-+	}
-+}
-+
-+// initFFI initializes the runtime and creates a persistent boxed pipeline; ffiOnce makes later calls return the first result.
-+func initFFI() error {
-+	ffiOnce.Do(func() {
-+		loadDynamoConfig() // panics on a missing or invalid DYNAMO_KV_BLOCK_SIZE
-+
-+		ns := C.CString(ffiNamespace)
-+		cm := C.CString(ffiComponent)
-+		model := C.CString(ffiModel)
-+		defer C.free(unsafe.Pointer(ns)) // the three C copies are released when this closure returns
-+		defer C.free(unsafe.Pointer(cm))
-+		defer C.free(unsafe.Pointer(model))
-+
-+		// Init Dynamo runtime
-+		if rc := C.dynamo_llm_init(ns, cm, C.int64_t(ffiWorkerID), C.uint32_t(ffiKvBlockSize)); rc != C.DYNAMO_OK {
-+			ffiErr = fmt.Errorf("dynamo_llm_init failed")
-+			return
-+		}
-+		runtimeInitialized = true // stays true even if pipeline creation below fails
-+
-+		// Create persistent pipeline; the write lock is held until the closure returns
-+		pipelineMutex.Lock()
-+		defer pipelineMutex.Unlock()
-+
-+		rc := C.dynamo_create_worker_selection_pipeline(
-+			ns,
-+			cm,
-+			model,
-+			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_ROUTING", true)),
-+			C.double(getEnvFloatOrDefault("DYNAMO_BUSY_THRESHOLD", -1.0)),
-+			C.double(ffiOverlapScoreWeight),
-+			C.double(ffiRouterTemperature),
-+			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_EVENTS", true)),
-+			C.bool(getEnvBoolOrDefault("DYNAMO_ROUTER_REPLICA_SYNC", true)),
-+			&pipeline, // out-parameter for the package-level handle
-+		)
-+		if rc != C.DYNAMO_OK {
-+			ffiErr = fmt.Errorf("dynamo_create_worker_selection_pipeline failed")
-+			return
-+		}
-+	})
-+	return ffiErr
-+}
-+
-+// --------------------------- scoring ---------------------------
-+
-+func encodeTokenData(tokens []int64) string {
-+	b, _ := json.Marshal(tokens) // JSON array of the token ids; the Marshal error is discarded
-+	return base64.StdEncoding.EncodeToString(b) // standard base64 of that JSON text
-+}
-+
-+// Score asks the Dynamo router for a worker, publishes the chosen id via cycle state and
-+// request headers, and gives every pod the same score of 1.0. Router errors are only logged.
-+func (k *KVAwareScorer) Score(
-+	ctx context.Context,
-+	cycle *schedtypes.CycleState,
-+	req *schedtypes.LLMRequest,
-+	pods []schedtypes.Pod,
-+) map[schedtypes.Pod]float64 {
-+	logger := log.FromContext(ctx)
-+	workerID, tokenData, err := k.callDynamoRouter(ctx, req)
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "Dynamo call failed; proceeding without worker id")
-+	} else if workerID != "" {
-+		logger.V(logutil.DEFAULT).Info(
-+			"Dynamo router selected worker",
-+			"workerID", workerID,
-+			"tokenDataCount", len(tokenData),
-+			"tokenData", tokenData,
-+		)
-+		cycle.Write(StateKeyWorkerInstanceID, stateString(workerID))
-+		if req != nil { // the router path tolerates a nil request, so guard the header writes as well
-+			if req.Headers == nil {
-+				req.Headers = map[string]string{}
-+			}
-+			req.Headers[WorkerIDHeader] = workerID
-+			if len(tokenData) > 0 {
-+				req.Headers[TokenDataHeader] = encodeTokenData(tokenData)
-+			}
-+		}
-+	}
-+
-+	out := make(map[schedtypes.Pod]float64, len(pods))
-+	for _, p := range pods {
-+		out[p] = 1.0
-+	}
-+	return out
-+}
-+
-+// --------------------------- router call (persistent only) ---------------------------
-+
-+func (k *KVAwareScorer) callDynamoRouter(
-+	ctx context.Context,
-+	req *schedtypes.LLMRequest,
-+) (string, []int64, error) {
-+	logger := log.FromContext(ctx)
-+	// Queries the persistent pipeline and returns the worker id (as decimal text) plus the token ids it reported.
-+	if err := initFFI(); err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "FFI init failed")
-+		return "", nil, err
-+	}
-+	if !runtimeInitialized {
-+		return "", nil, fmt.Errorf("dynamo runtime not initialized")
-+	}
-+
-+	pipelineMutex.RLock()
-+	currentPipeline := pipeline // copy the handle; the lock is not held during the C call below
-+	pipelineMutex.RUnlock()
-+
-+	if currentPipeline == nil {
-+		return "", nil, fmt.Errorf("dynamo worker selection pipeline not created")
-+	}
-+
-+	// Build OpenAI-compatible JSON request
-+	requestBody := buildOpenAIRequest(req)
-+	requestJSON, err := json.Marshal(requestBody)
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "Failed to marshal OpenAI request")
-+		return "", nil, fmt.Errorf("marshal OpenAI request: %w", err)
-+	}
-+	cRequestJSON := C.CString(string(requestJSON))
-+	defer C.free(unsafe.Pointer(cRequestJSON)) // the C copy of the request is released on every return path
-+
-+	// Output variables filled in by the C call
-+	var cWorkerID C.int64_t
-+	var cTokens *C.uint32_t
-+	var cTokenCount C.size_t
-+	var cAnnotatedJSON *C.char
-+
-+	// Call the worker selection pipeline
-+	rc := C.dynamo_query_worker_selection_and_annotate(
-+		currentPipeline,
-+		cRequestJSON,
-+		&cWorkerID,
-+		&cTokens,
-+		&cTokenCount,
-+		&cAnnotatedJSON,
-+	)
-+	if rc != C.DYNAMO_OK {
-+		return "", nil, fmt.Errorf("dynamo_query_worker_selection_and_annotate failed") // NOTE(review): outputs are not freed here; confirm the C API leaves them unset on error
-+	}
-+
-+	// Copy tokens into Go memory and free C memory
-+	count := int(uintptr(cTokenCount))
-+	var tokens64 []int64 // stays nil when no tokens were returned
-+	if count > 0 && cTokens != nil {
-+		src := unsafe.Slice((*uint32)(unsafe.Pointer(cTokens)), count) // view over the C buffer, valid only until the free below
-+		tokens64 = make([]int64, count)
-+		for i := 0; i < count; i++ {
-+			tokens64[i] = int64(src[i])
-+		}
-+	}
-+	C.dynamo_free_worker_selection_result(cTokens, cTokenCount, cAnnotatedJSON) // the annotated JSON is released without being read
-+
-+	workerID := fmt.Sprintf("%d", int64(cWorkerID))
-+	logger.V(logutil.DEFAULT).Info("Worker selection completed",
-+		"workerID", workerID, "tokenCount", count)
-+
-+	return workerID, tokens64, nil
-+}
-+
-+// buildOpenAIRequest assembles the router payload: one user message, the target model (or ffiModel) and fixed options.
-+func buildOpenAIRequest(req *schedtypes.LLMRequest) map[string]any {
-+	prompt, model := "default prompt", ffiModel
-+	if req != nil {
-+		if strings.TrimSpace(req.Prompt) != "" {
-+			prompt = req.Prompt
-+		}
-+		if strings.TrimSpace(req.TargetModel) != "" {
-+			model = req.TargetModel
-+		}
-+	}
-+	return map[string]any{
-+		"messages":    []map[string]any{{"role": "user", "content": prompt}},
-+		"model":       model,
-+		"max_tokens":  1,
-+		"temperature": 0.0,
-+		"stream":      true,
-+		"nvext":       map[string]any{"annotations": []string{"query_instance_id"}},
-+	}
-+}
-+
-+// --------------------------- shutdown ---------------------------
-+
-+func cleanupDynamo() error { // destroys the pipeline (if any), then shuts the runtime down (if initialized)
-+	pipelineMutex.Lock()
-+	defer pipelineMutex.Unlock()
-+
-+	if pipeline != nil {
-+		if rc := C.dynamo_destroy_worker_selection_pipeline(pipeline); rc != C.DYNAMO_OK {
-+			fmt.Printf("Warning: dynamo_destroy_worker_selection_pipeline failed\n") // only reported; cleanup continues
-+		}
-+		pipeline = nil // cleared whether or not the destroy call succeeded
-+	}
-+
-+	if runtimeInitialized {
-+		if rc := C.dynamo_llm_shutdown(); rc != C.DYNAMO_OK {
-+			return fmt.Errorf("dynamo_llm_shutdown failed") // runtimeInitialized stays true in this case
-+		}
-+		runtimeInitialized = false
-+	}
-+	return nil
-+}
--- a/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch
+++ b/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch
-diff --git a/Dockerfile b/Dockerfile
-deleted file mode 100644
-index fb73765..0000000
--- a/Dockerfile
-+++ /dev/null
-@@ -1,33 +0,0 @@
-# Dockerfile has specific requirement to put this ARG at the beginning:
-# https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
-ARG BUILDER_IMAGE=golang:1.24
-ARG BASE_IMAGE=gcr.io/distroless/static:nonroot
-
-## Multistage build
-FROM ${BUILDER_IMAGE} AS builder
-ENV CGO_ENABLED=0
-ENV GOOS=linux
-ENV GOARCH=amd64
-ARG COMMIT_SHA=unknown
-ARG BUILD_REF
-
-# Dependencies
-WORKDIR /src
-COPY go.mod go.sum ./
-RUN go mod download
-
-# Sources
-COPY cmd/epp ./cmd/epp
-COPY pkg/epp ./pkg/epp
-COPY internal ./internal
-COPY api ./api
-WORKDIR /src/cmd/epp
-RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" -o /epp
-
-## Multistage deploy
-FROM ${BASE_IMAGE}
-
-WORKDIR /
-COPY --from=builder /epp /epp
-
-ENTRYPOINT ["/epp"]
-diff --git a/Makefile b/Makefile
-index dee7e99..d3f9ec7 100644
--- a/Makefile
-+++ b/Makefile
-@@ -170,6 +170,49 @@ verify-all:
- 
- ##@ Build
- 
-+##@ Dynamo EPP with FFI
-+
-+# Build the Dynamo EPP image with CGO static library support. The recipe is a single shell invocation so BUILDER is still set at the rm.
-+.PHONY: dynamo-image-local-build
-+dynamo-image-local-build: ## Build the Dynamo EPP image using Docker Buildx for local development.
-+	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use) || exit 1; \
-+	$(MAKE) dynamo-image-build PUSH=$(PUSH) && \
-+	$(MAKE) dynamo-image-build LOAD=$(LOAD); rc=$$?; \
-+	$(DOCKER_BUILDX_CMD) rm "$$BUILDER"; exit $$rc
-+
-+.PHONY: dynamo-image-local-push
-+dynamo-image-local-push: PUSH=--push ## Build the Dynamo EPP image for local development and push it to $IMAGE_REPO.
-+dynamo-image-local-push: dynamo-image-local-build
-+
-+.PHONY: dynamo-image-local-load
-+dynamo-image-local-load: LOAD=--load ## Build the Dynamo EPP image for local development and load it in the local Docker registry.
-+dynamo-image-local-load: dynamo-image-local-build
-+
-+.PHONY: dynamo-image-build
-+dynamo-image-build: ## Build the Dynamo EPP image using Docker Buildx with CGO support.
-+	$(IMAGE_BUILD_CMD) -f Dockerfile.dynamo -t $(IMAGE_TAG) \
-+		--platform=$(PLATFORMS) \
-+		--build-arg DOCKER_PROXY=$(DOCKER_PROXY) \
-+		--build-arg BASE_IMAGE=ubuntu:24.04 \
-+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
-+		--build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \
-+		--build-arg BUILD_REF=${BUILD_REF} \
-+		$(PUSH) \
-+		$(LOAD) \
-+		$(IMAGE_BUILD_EXTRA_OPTS) ./
-+
-+.PHONY: dynamo-image-push
-+dynamo-image-push: PUSH=--push ## Build the Dynamo EPP image and push it to $IMAGE_REPO.
-+dynamo-image-push: dynamo-image-build
-+
-+.PHONY: dynamo-image-load
-+dynamo-image-load: LOAD=--load ## Build the Dynamo EPP image and load it in the local Docker registry.
-+dynamo-image-load: dynamo-image-build
-+
-+.PHONY: dynamo-image-kind
-+dynamo-image-kind: dynamo-image-build ## Build the Dynamo EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
-+	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
-+
- # Build the container image
- .PHONY: image-local-build
- image-local-build: ## Build the EPP image using Docker Buildx for local development.
-diff --git a/cmd/epp/main.go b/cmd/epp/main.go
-index b5e0617..b5c0312 100644
--- a/cmd/epp/main.go
-+++ b/cmd/epp/main.go
-@@ -22,6 +22,12 @@ import (
- 	ctrl "sigs.k8s.io/controller-runtime"
- 
- 	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
-+	eppplugins "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+
-+	// Dynamo plugins
-+	dyncleanup "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_cleanup"
-+	dynprereq "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid"
-+	dynscorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
- )
- 
- func main() {
-@@ -30,6 +36,10 @@ func main() {
- 	// For adding out-of-tree plugins to the plugins registry, use the following:
- 	// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
- 
-+	eppplugins.Register("dynamo-inject-workerid", dynprereq.InjectWorkerIDPreRequestFactory)
-+	eppplugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
-+	eppplugins.Register("dynamo-cleanup", dyncleanup.DynamoCleanupPluginFactory)
-+
- 	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
- 		os.Exit(1)
- 	}
-diff --git a/pkg/epp/requestcontrol/body_mutator.go b/pkg/epp/requestcontrol/body_mutator.go
-new file mode 100644
-index 0000000..de87445
--- /dev/null
-+++ b/pkg/epp/requestcontrol/body_mutator.go
-@@ -0,0 +1,19 @@
-+package requestcontrol
-+
-+import (
-+	"context"
-+
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+)
-+
-+// RequestBodyMutator is an optional capability of pre-request plugins that
-+// want to edit the outbound request body.
-+//
-+// runPreRequestPlugins type-asserts every pre-request plugin against this
-+// interface; on a match with a non-nil body it calls MutateRequestBody right
-+// after that plugin's PreRequest has returned. The method has no return
-+// value, so any change has to be made on the body map that is passed in,
-+// and a plugin without this method is simply skipped.
-+type RequestBodyMutator interface {
-+	MutateRequestBody(ctx context.Context, request *schedtypes.LLMRequest, schedulingResult *schedtypes.SchedulingResult, targetPort int, body map[string]any)
-+}
-diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go
-index 670d922..0cf04cb 100644
--- a/pkg/epp/requestcontrol/director.go
-+++ b/pkg/epp/requestcontrol/director.go
-@@ -130,6 +130,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
- 		TargetModel: reqCtx.ResolvedTargetModel,
- 		Prompt:      prompt,
- 		Headers:     reqCtx.Request.Headers,
-+		Annotations: map[string]any{},
- 	}
- 
- 	logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality)
-@@ -253,7 +254,7 @@ func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestC
- 	reqCtx.TargetPod = targetPod
- 	reqCtx.TargetEndpoint = endpoint
- 
-	d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort)
-+	d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort, reqCtx.Request.Body)
- 
- 	return reqCtx, nil
- }
-@@ -319,13 +320,20 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed
- 	return ""
- }
- 
-func (d *Director) runPreRequestPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult,
-+func (d *Director) runPreRequestPlugins(
-+	ctx context.Context,
-+	request *schedulingtypes.LLMRequest,
-+	schedulingResult *schedulingtypes.SchedulingResult,
- 	targetPort int,
-+	body map[string]any,
- ) {
- 	for _, plugin := range d.preRequestPlugins {
- 		log.FromContext(ctx).V(logutil.DEBUG).Info("Running pre-request plugin", "plugin", plugin.TypedName().Type)
- 		before := time.Now()
- 		plugin.PreRequest(ctx, request, schedulingResult, targetPort)
-+		if mutator, ok := plugin.(RequestBodyMutator); ok && body != nil {
-+			mutator.MutateRequestBody(ctx, request, schedulingResult, targetPort, body)
-+		}
- 		metrics.RecordRequestControlPluginProcessingLatency(PreRequestPluginType, plugin.TypedName().Type, time.Since(before))
- 	}
- }
-diff --git a/pkg/epp/requestcontrol/plugins/dynamo_cleanup/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_cleanup/plugin.go
-new file mode 100644
-index 0000000..a389372
--- /dev/null
-+++ b/pkg/epp/requestcontrol/plugins/dynamo_cleanup/plugin.go
-@@ -0,0 +1,86 @@
-+package dynamo_cleanup
-+
-+import (
-+	"context"
-+	"encoding/json"
-+
-+	log "sigs.k8s.io/controller-runtime/pkg/log"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-+
-+	dynamo "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
-+)
-+
-+const (
-+	PluginName = "dynamo-cleanup"
-+	PluginType = "dynamo-cleanup"
-+)
-+
-+// DynamoCleanupPlugin is a PostResponse plugin that cleans up router state
-+// when a request completes. It calls dynamo_router_free_request to release
-+// the bookkeeping resources associated with the request.
-+type DynamoCleanupPlugin struct {
-+	typedName plugins.TypedName
-+}
-+
-+var _ plugins.Plugin = (*DynamoCleanupPlugin)(nil)
-+var _ rc.PostResponse = (*DynamoCleanupPlugin)(nil)
-+
-+// NewDynamoCleanupPlugin returns a plugin carrying the default type and name.
-+func NewDynamoCleanupPlugin() *DynamoCleanupPlugin {
-+	defaults := plugins.TypedName{Type: PluginType, Name: PluginName}
-+	// WithName may later replace the Name field.
-+	return &DynamoCleanupPlugin{typedName: defaults}
-+}
-+
-+// WithName sets a custom name for the plugin.
-+func (p *DynamoCleanupPlugin) WithName(name string) *DynamoCleanupPlugin {
-+	p.typedName.Name = name
-+	return p
-+}
-+
-+// DynamoCleanupPluginFactory creates a DynamoCleanupPlugin from configuration.
-+func DynamoCleanupPluginFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	return NewDynamoCleanupPlugin().WithName(name), nil
-+}
-+
-+// TypedName returns the plugin's type and name.
-+func (p *DynamoCleanupPlugin) TypedName() plugins.TypedName {
-+	return p.typedName
-+}
-+
-+// PostResponse calls dynamo.CallFreeRequest with the request's RequestId once a
-+// response has been received. A nil request or an empty ID skips the call.
-+func (p *DynamoCleanupPlugin) PostResponse(
-+	ctx context.Context,
-+	request *schedtypes.LLMRequest,
-+	response *rc.Response,
-+	targetPod *backend.Pod,
-+) {
-+	logger := log.FromContext(ctx)
-+
-+	// Nothing can be freed without a request and its ID.
-+	switch {
-+	case request == nil:
-+		logger.V(logutil.DEBUG).Info("DynamoCleanupPlugin: request is nil, skipping cleanup")
-+		return
-+	case request.RequestId == "":
-+		logger.V(logutil.DEBUG).Info("DynamoCleanupPlugin: no request ID, skipping cleanup")
-+		return
-+	}
-+
-+	id := request.RequestId
-+	if err := dynamo.CallFreeRequest(id); err != nil {
-+		// A failed free is logged and otherwise ignored.
-+		logger.V(logutil.DEFAULT).Error(err, "DynamoCleanupPlugin: failed to free request",
-+			"requestID", id)
-+		return
-+	}
-+
-+	logger.V(logutil.VERBOSE).Info("DynamoCleanupPlugin: freed request from router",
-+		"requestID", id)
-+}
-+
-diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
-new file mode 100644
-index 0000000..1c8f979
--- /dev/null
-+++ b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
-@@ -0,0 +1,171 @@
-+package dynamo_inject_workerid
-+
-+import (
-+	"context"
-+	"encoding/json"
-+	"strconv"
-+	"strings"
-+
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+)
-+
-+const (
-+	typeString             = "dynamo-inject-workerid" // Type placed in the plugin's TypedName
-+	pluginName             = "dynamo-inject-workerid" // default Name; WithName replaces it
-+	WorkerIDHeader         = "x-worker-instance-id"   // header holding the worker id read by both hooks
-+	PrefillWorkerIDHeader  = "x-prefiller-host-port"  // header holding the prefill worker id read by both hooks
-+	tokenDataAnnotationKey = "dynamo/token-data"      // req.Annotations key MutateRequestBody reads token ids from
-+)
-+
-+var _ plugins.Plugin = (*InjectWorkerIDPreRequest)(nil)
-+var _ rc.PreRequest = (*InjectWorkerIDPreRequest)(nil)
-+var _ rc.RequestBodyMutator = (*InjectWorkerIDPreRequest)(nil)
-+
-+type InjectWorkerIDPreRequest struct {
-+	typedName plugins.TypedName
-+}
-+
-+func NewInjectWorkerIDPreRequest() *InjectWorkerIDPreRequest {
-+	// Defaults for both fields; WithName can replace the Name afterwards.
-+	defaults := plugins.TypedName{Type: typeString, Name: pluginName}
-+	return &InjectWorkerIDPreRequest{typedName: defaults}
-+}
-+
-+func (p *InjectWorkerIDPreRequest) WithName(name string) *InjectWorkerIDPreRequest {
-+	p.typedName.Name = name
-+	return p
-+}
-+
-+func InjectWorkerIDPreRequestFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	return NewInjectWorkerIDPreRequest().WithName(name), nil
-+}
-+
-+func (p *InjectWorkerIDPreRequest) TypedName() plugins.TypedName { return p.typedName }
-+
-+// PreRequest rewrites the worker-id and prefill-worker-id headers with
-+// surrounding whitespace removed. A header whose trimmed value is empty is
-+// left exactly as it was, and a nil request is ignored.
-+func (p *InjectWorkerIDPreRequest) PreRequest(
-+	_ context.Context,
-+	req *schedtypes.LLMRequest,
-+	_ *schedtypes.SchedulingResult,
-+	_ int,
-+) {
-+	if req == nil {
-+		return
-+	}
-+	if req.Headers == nil {
-+		// An empty map is installed even if nothing ends up being written.
-+		req.Headers = make(map[string]string)
-+	}
-+
-+	// Both id headers get the same normalization, worker id first.
-+	for _, name := range []string{WorkerIDHeader, PrefillWorkerIDHeader} {
-+		trimmed := strings.TrimSpace(req.Headers[name])
-+		if trimmed != "" {
-+			req.Headers[name] = trimmed
-+		}
-+	}
-+}
-+
-+func (p *InjectWorkerIDPreRequest) MutateRequestBody(
-+	_ context.Context,
-+	req *schedtypes.LLMRequest,
-+	_ *schedtypes.SchedulingResult,
-+	_ int,
-+	body map[string]any,
-+) { // writes worker ids and token data into body["nvext"], based on req headers and annotations
-+	if req == nil || body == nil {
-+		return
-+	}
-+	if req.Headers == nil {
-+		return
-+	}
-+
-+	wid := strings.TrimSpace(req.Headers[WorkerIDHeader])
-+	if wid == "" {
-+		return // without a worker id the body is left untouched
-+	}
-+
-+	prefillWid := strings.TrimSpace(req.Headers[PrefillWorkerIDHeader])
-+
-+	nvext, _ := body["nvext"].(map[string]any) // nil when the key is absent or holds another type
-+	if nvext == nil {
-+		nvext = map[string]any{}
-+		body["nvext"] = nvext // overwrites any non-map value that was stored under "nvext"
-+	}
-+
-+	if prefillWid != "" && prefillWid != wid {
-+		// Disaggregated mode: use prefill_worker_id and decode_worker_id
-+		if prefillWidUint, err := strconv.ParseUint(prefillWid, 10, 64); err == nil { // an id that is not an unsigned decimal is skipped silently
-+			nvext["prefill_worker_id"] = prefillWidUint
-+		}
-+		if widUint, err := strconv.ParseUint(wid, 10, 64); err == nil {
-+			nvext["decode_worker_id"] = widUint
-+		}
-+	} else {
-+		// Aggregated mode (empty prefill or prefill == decode): use backend_instance_id
-+		if widUint, err := strconv.ParseUint(wid, 10, 64); err == nil {
-+			nvext["backend_instance_id"] = widUint
-+		}
-+	}
-+
-+	if tokens, ok := req.Annotations[tokenDataAnnotationKey]; ok {
-+		switch v := tokens.(type) { // any other dynamic type leaves token_data unset
-+		case []int64:
-+			if len(v) > 0 {
-+				nvext["token_data"] = v // the annotation's slice is stored as-is, not copied
-+			}
-+		case []any:
-+			var out []int64
-+			for _, elem := range v {
-+				switch t := elem.(type) { // elements of other types are dropped
-+				case int64:
-+					out = append(out, t)
-+				case float64:
-+					out = append(out, int64(t)) // fractional part is discarded by the conversion
-+				}
-+			}
-+			if len(out) > 0 {
-+				nvext["token_data"] = out
-+			}
-+		case json.RawMessage:
-+			var out []int64
-+			if err := json.Unmarshal(v, &out); err == nil && len(out) > 0 { // undecodable or empty JSON is ignored
-+				nvext["token_data"] = out
-+			}
-+		}
-+	}
-+
-+	// Remove query_instance_id from nvext.annotations if present; the key is deleted when nothing else remains
-+	if annotations, ok := nvext["annotations"]; ok {
-+		switch annList := annotations.(type) { // values that are neither []string nor []any are left untouched
-+		case []string:
-+			filtered := make([]string, 0, len(annList))
-+			for _, ann := range annList {
-+				if ann != "query_instance_id" {
-+					filtered = append(filtered, ann)
-+				}
-+			}
-+			if len(filtered) == 0 {
-+				delete(nvext, "annotations")
-+			} else {
-+				nvext["annotations"] = filtered
-+			}
-+		case []any:
-+			filtered := make([]any, 0, len(annList))
-+			for _, ann := range annList {
-+				if str, ok := ann.(string); !ok || str != "query_instance_id" { // non-string elements are kept
-+					filtered = append(filtered, ann)
-+				}
-+			}
-+			if len(filtered) == 0 {
-+				delete(nvext, "annotations")
-+			} else {
-+				nvext["annotations"] = filtered
-+			}
-+		}
-+	}
-+}
-diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
-new file mode 100644
-index 0000000..e94b72b
--- /dev/null
-+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
-@@ -0,0 +1,24 @@
-+# This is an example for configuring the EPP to use the dynamo token-aware kv router for scoring the pods
-+apiVersion: inference.networking.x-k8s.io/v1alpha1
-+kind: EndpointPickerConfig
-+plugins:
-+  # Required: tells EPP which profile to use (even if you only have one)
-+  - type: single-profile-handler
-+
-+  # Picker: chooses the final endpoint after scoring
-+  - name: picker
-+    type: max-score-picker
-+  - name: dyn-pre
-+    type: dynamo-inject-workerid
-+    parameters: {}
-+  - name: dyn-kv
-+    type: kv-aware-scorer
-+  # Cleanup: frees router bookkeeping when request completes
-+  - name: dyn-cleanup
-+    type: dynamo-cleanup
-+schedulingProfiles:
-+  - name: default
-+    plugins:
-+      - pluginRef: dyn-kv
-+        weight: 1
-+      - pluginRef: picker
-diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
-new file mode 100644
-index 0000000..31af16e
--- /dev/null
-+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
-@@ -0,0 +1,587 @@
-+package dynamo_kv_scorer
-+
-+/*
-+#cgo CPPFLAGS: -I${SRCDIR}/include
-+#cgo CXXFLAGS: -std=c++17
-+#cgo LDFLAGS: ${SRCDIR}/lib/libdynamo_llm_capi.a -lstdc++ -ldl -lpthread -lm
-+
-+#include <stdint.h>
-+#include <stddef.h>
-+#include <stdlib.h>   // for free
-+#include <stdbool.h>
-+
-+// enum underlying type is uint32_t; matches cbindgen output
-+typedef uint32_t dynamo_llm_result_t;
-+enum { DYNAMO_OK = 0, DYNAMO_ERR = 1 };
-+
-+// opaque handle forward-decl
-+struct WorkerSelectionPipeline;
-+typedef struct WorkerSelectionPipeline WorkerSelectionPipeline;
-+
-+// Prototypes (C-compatible)
-+dynamo_llm_result_t dynamo_llm_init(const char *namespace_c_str,
-+                                    const char *component_c_str,
-+                                    int64_t worker_id,
-+                                    uint32_t kv_block_size);
-+
-+dynamo_llm_result_t dynamo_llm_shutdown(void);
-+dynamo_llm_result_t dynamo_llm_load_publisher_create(void);
-+
-+dynamo_llm_result_t dynamo_kv_event_publish_stored(uint64_t event_id,
-+                                                   const uint32_t *token_ids,
-+                                                   const uintptr_t *num_block_tokens,
-+                                                   const uint64_t *block_ids,
-+                                                   size_t num_blocks,
-+                                                   const uint64_t *parent_hash,
-+                                                   uint64_t lora_id);
-+
-+dynamo_llm_result_t dynamo_kv_event_publish_removed(uint64_t event_id,
-+                                                    const uint64_t *block_ids,
-+                                                    size_t num_blocks);
-+
-+dynamo_llm_result_t dynamo_create_worker_selection_pipeline(const char *namespace_c_str,
-+                                                            const char *component_c_str,
-+                                                            const char *model_name_c_str,
-+                                                            bool use_kv_routing,
-+                                                            double busy_threshold,
-+                                                            double overlap_score_weight,
-+                                                            double router_temperature,
-+                                                            bool use_kv_events,
-+                                                            bool router_replica_sync,
-+                                                            bool enforce_disagg,
-+                                                            WorkerSelectionPipeline **pipeline_out);
-+
-+dynamo_llm_result_t dynamo_destroy_worker_selection_pipeline(WorkerSelectionPipeline *pipeline);
-+
-+dynamo_llm_result_t dynamo_query_worker_selection_and_annotate(WorkerSelectionPipeline *pipeline,
-+                                                               const char *request_json_c_str,
-+                                                               int64_t *decode_worker_id_out,
-+                                                               int64_t *prefill_worker_id_out,
-+                                                               uint32_t **token_ids_out,
-+                                                               size_t *token_count_out,
-+                                                               char **annotated_request_json_out);
-+
-+dynamo_llm_result_t dynamo_free_worker_selection_result(uint32_t *token_ids,
-+                                                        size_t token_count,
-+                                                        char *annotated_request_json);
-+
-+// Router bookkeeping functions for GAIE integration
-+dynamo_llm_result_t dynamo_router_add_request(WorkerSelectionPipeline *pipeline,
-+                                              const char *request_id_c_str,
-+                                              const uint32_t *token_ids,
-+                                              size_t token_count,
-+                                              uint64_t worker_id,
-+                                              uint32_t dp_rank);
-+
-+dynamo_llm_result_t dynamo_router_mark_prefill_complete(WorkerSelectionPipeline *pipeline,
-+                                                        const char *request_id_c_str);
-+
-+dynamo_llm_result_t dynamo_router_free_request(WorkerSelectionPipeline *pipeline,
-+                                               const char *request_id_c_str);
-+*/
-+import "C"
-+
-+import (
-+	"context"
-+	"encoding/json"
-+	"fmt"
-+	"os"
-+	"strconv"
-+	"strings"
-+	"sync"
-+	"unsafe"
-+
-+	log "sigs.k8s.io/controller-runtime/pkg/log"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-+)
-+
-+// Identifiers used by the Dynamo KV-aware scorer: plugin naming, cycle-state
-+// keys, request header names, and the request annotation key for token data.
-+const (
-+	// Default instance name assigned by NewKVAwareScorer.
-+	PluginName               = "dynamo-kv-scorer"
-+	// Plugin type assigned by NewKVAwareScorer.
-+	KVAwareScorerType        = "kv-aware-scorer"
-+	// Cycle-state key under which Score stores the selected worker ID.
-+	StateKeyWorkerInstanceID = schedtypes.StateKey("dynamo/worker-instance-id")
-+	// Cycle-state key under which Score stores the prefill worker ID, when one is returned.
-+	StateKeyPrefillWorkerID  = schedtypes.StateKey("dynamo/prefill-worker-id")
-+	// Cycle-state key under which Score stores the request ID, when one is available.
-+	StateKeyRequestID        = schedtypes.StateKey("dynamo/request-id")
-+	// Request header that Score sets to the selected worker ID.
-+	WorkerIDHeader           = "x-worker-instance-id"
-+	// Request header that Score sets to the prefill worker ID, when one is returned.
-+	PrefillWorkerIDHeader    = "x-prefiller-host-port"
-+	// Key in LLMRequest.Annotations under which Score stores a copy of the token IDs.
-+	tokenDataAnnotationKey   = "dynamo/token-data"
-+)
-+
-+// --------------------------- config / env ---------------------------
-+
-+// warmupOnce ensures the factory runs the FFI warm-up a single time;
-+// warmupErr keeps the outcome of that run for every later factory call.
-+var warmupOnce sync.Once
-+var warmupErr error
-+
-+// stateString is a string value that can be stored in cycle state.
-+type stateString string
-+
-+// params is the plugin's parameter struct; it currently has no fields.
-+type params struct {
-+}
-+
-+// Clone returns the value itself.
-+func (s stateString) Clone() schedtypes.StateData { return s }
-+
-+// KVAwareScorer is the scorer plugin; its only state is its type/name pair.
-+type KVAwareScorer struct {
-+	typedName plugins.TypedName
-+}
-+
-+// Compile-time checks that *KVAwareScorer satisfies both interfaces.
-+var _ plugins.Plugin = (*KVAwareScorer)(nil)
-+var _ framework.Scorer = (*KVAwareScorer)(nil)
-+
-+// NewKVAwareScorer builds a scorer that carries the default plugin type and name.
-+func NewKVAwareScorer() *KVAwareScorer {
-+	scorer := new(KVAwareScorer)
-+	scorer.typedName = plugins.TypedName{Name: PluginName, Type: KVAwareScorerType}
-+	return scorer
-+}
-+
-+// WithName overrides the instance name and returns the receiver so calls can be chained.
-+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer {
-+	k.typedName.Name = name
-+	return k
-+}
-+
-+// KVAwareScorerFactory creates a KVAwareScorer named name and runs the
-+// process-wide FFI warm-up on its first invocation.
-+//
-+// raw is decoded into the (currently empty) params struct; a decode error is
-+// ignored. It returns an error when the warm-up failed or panicked.
-+func KVAwareScorerFactory(name string, raw json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
-+	var cfg params
-+	_ = json.Unmarshal(raw, &cfg)
-+
-+	scorer := NewKVAwareScorer().WithName(name)
-+
-+	// Only the first caller initializes the runtime and pipeline. A panic raised
-+	// while doing so is captured into warmupErr rather than propagated.
-+	warmupOnce.Do(func() {
-+		defer func() {
-+			if recovered := recover(); recovered != nil {
-+				warmupErr = fmt.Errorf("Dynamo configuration error: %v", recovered)
-+			}
-+		}()
-+		warmupErr = initFFI()
-+	})
-+
-+	if warmupErr == nil {
-+		return scorer, nil
-+	}
-+	return nil, fmt.Errorf("Dynamo FFI init for the Router failed: %w", warmupErr)
-+}
-+
-+// TypedName reports the plugin's type and instance name.
-+func (k *KVAwareScorer) TypedName() plugins.TypedName {
-+	return k.typedName
-+}
-+
-+// --------------------------- FFI integration ---------------------------
-+
-+// Package-level FFI state shared by every scorer instance.
-+var (
-+	// ffiOnce guards the body of initFFI; ffiErr holds the error it recorded.
-+	ffiOnce sync.Once
-+	ffiErr  error
-+
-+	// Settings populated from the environment by loadDynamoConfig.
-+	ffiNamespace          string
-+	ffiComponent          string
-+	ffiModel              string
-+	ffiOverlapScoreWeight float64
-+	ffiRouterTemperature  float64
-+	ffiKvBlockSize        uint32
-+	ffiWorkerID           int64
-+	ffiEnforceDisagg      bool
-+
-+	// Set to true by initFFI once dynamo_llm_init succeeds and back to false by
-+	// cleanupDynamo after a successful shutdown.
-+	// NOTE(review): the call helpers read this flag without holding
-+	// pipelineMutex, while cleanupDynamo writes it under the lock — confirm this
-+	// is safe for the intended shutdown sequence.
-+	runtimeInitialized bool
-+
-+	// Boxed pipeline handle (owned on the Rust side, opaque here)
-+	pipeline      *C.struct_WorkerSelectionPipeline
-+	// Guards reads and writes of the pipeline handle.
-+	pipelineMutex sync.RWMutex
-+)
-+
-+// loadDynamoConfig populates the package-level ffi* settings from environment
-+// variables, applying defaults for everything except DYNAMO_KV_BLOCK_SIZE.
-+//
-+// It panics when DYNAMO_KV_BLOCK_SIZE is unset, is not a base-10 integer, lies
-+// outside [16,8192], or is not a power of two.
-+func loadDynamoConfig() {
-+	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
-+	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
-+	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
-+	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
-+	ffiEnforceDisagg = getEnvBoolOrDefault("DYNAMO_ENFORCE_DISAGG", false)
-+
-+	ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
-+	ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)
-+
-+	kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE")
-+	if kvBlockSizeStr == "" {
-+		panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
-+	}
-+	// ParseInt must consume the whole string, so values with trailing garbage
-+	// such as "16abc" are rejected instead of being read as 16.
-+	blockSize, err := strconv.ParseInt(strings.TrimSpace(kvBlockSizeStr), 10, 64)
-+	if err != nil {
-+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
-+	}
-+	// Validate the 64-bit value before narrowing: converting first would let
-+	// negative or very large inputs wrap around into the accepted range.
-+	if blockSize < 16 || blockSize > 8192 {
-+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", blockSize))
-+	}
-+	if (blockSize & (blockSize - 1)) != 0 {
-+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", blockSize))
-+	}
-+	ffiKvBlockSize = uint32(blockSize)
-+	fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
-+}
-+
-+// getEnvOrDefault returns the value of environment variable key, or def when
-+// the variable is unset or empty.
-+func getEnvOrDefault(key, def string) string {
-+	value := os.Getenv(key)
-+	if value == "" {
-+		return def
-+	}
-+	return value
-+}
-+// getEnvInt64OrDefault returns environment variable key parsed as a base-10
-+// int64, or def when the variable is unset, empty, or not a valid integer.
-+func getEnvInt64OrDefault(key string, def int64) int64 {
-+	v := strings.TrimSpace(os.Getenv(key))
-+	if v == "" {
-+		return def
-+	}
-+	// ParseInt requires the entire string to be numeric, so a value such as
-+	// "12abc" falls back to def instead of being silently read as 12.
-+	if parsed, err := strconv.ParseInt(v, 10, 64); err == nil {
-+		return parsed
-+	}
-+	return def
-+}
-+// getEnvFloatOrDefault returns environment variable key parsed as a float64,
-+// or def when the variable is unset, empty, or not a valid number.
-+func getEnvFloatOrDefault(key string, def float64) float64 {
-+	v := strings.TrimSpace(os.Getenv(key))
-+	if v == "" {
-+		return def
-+	}
-+	// ParseFloat requires the entire string to be numeric, so a value such as
-+	// "1.5x" falls back to def instead of being silently read as 1.5.
-+	if parsed, err := strconv.ParseFloat(v, 64); err == nil {
-+		return parsed
-+	}
-+	return def
-+}
-+// getEnvBoolOrDefault interprets environment variable key as a boolean.
-+// "true", "1", "yes" and "on" map to true; "false", "0", "no" and "off" map to
-+// false (case-insensitively). Anything else, including unset, yields def.
-+func getEnvBoolOrDefault(key string, def bool) bool {
-+	recognized := map[string]bool{
-+		"true": true, "1": true, "yes": true, "on": true,
-+		"false": false, "0": false, "no": false, "off": false,
-+	}
-+	if parsed, found := recognized[strings.ToLower(os.Getenv(key))]; found {
-+		return parsed
-+	}
-+	return def
-+}
-+
-+// initFFI: initialize runtime and create a persistent boxed pipeline.
-+//
-+// The body runs at most once (guarded by ffiOnce); every call returns the
-+// error recorded by that single run, or nil. loadDynamoConfig may panic, and
-+// that panic is not recovered here.
-+func initFFI() error {
-+	ffiOnce.Do(func() {
-+		loadDynamoConfig()
-+
-+		// C copies of the config strings; freed when this closure returns.
-+		ns := C.CString(ffiNamespace)
-+		cm := C.CString(ffiComponent)
-+		model := C.CString(ffiModel)
-+		defer C.free(unsafe.Pointer(ns))
-+		defer C.free(unsafe.Pointer(cm))
-+		defer C.free(unsafe.Pointer(model))
-+
-+		// Init Dynamo runtime
-+		if rc := C.dynamo_llm_init(ns, cm, C.int64_t(ffiWorkerID), C.uint32_t(ffiKvBlockSize)); rc != C.DYNAMO_OK {
-+			ffiErr = fmt.Errorf("dynamo_llm_init failed")
-+			return
-+		}
-+		runtimeInitialized = true
-+
-+		// Create persistent pipeline
-+		// The write lock is held while the handle is being assigned.
-+		pipelineMutex.Lock()
-+		defer pipelineMutex.Unlock()
-+
-+		// Remaining router options are read directly from the environment here.
-+		rc := C.dynamo_create_worker_selection_pipeline(
-+			ns,
-+			cm,
-+			model,
-+			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_ROUTING", true)),
-+			C.double(getEnvFloatOrDefault("DYNAMO_BUSY_THRESHOLD", -1.0)),
-+			C.double(ffiOverlapScoreWeight),
-+			C.double(ffiRouterTemperature),
-+			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_EVENTS", true)),
-+			C.bool(getEnvBoolOrDefault("DYNAMO_ROUTER_REPLICA_SYNC", true)),
-+			C.bool(ffiEnforceDisagg),
-+			&pipeline,
-+		)
-+		if rc != C.DYNAMO_OK {
-+			// runtimeInitialized stays true on this path; only ffiErr reports the failure.
-+			ffiErr = fmt.Errorf("dynamo_create_worker_selection_pipeline failed")
-+			return
-+		}
-+	})
-+	return ffiErr
-+}
-+
-+// --------------------------- scoring ---------------------------
-+
-+// Score asks the Dynamo router to select a worker for req and records the
-+// result on the request and the cycle state; it does not rank pods itself.
-+//
-+// When the router returns a worker ID, Score writes the worker ID (and the
-+// prefill worker ID, when present) to cycle state and to req.Headers, stores a
-+// copy of the token IDs in req.Annotations under tokenDataAnnotationKey, and
-+// registers the request with router bookkeeping if req.RequestId is set.
-+// Router and bookkeeping errors are logged and otherwise ignored.
-+//
-+// The returned map assigns the same score, 1.0, to every pod in pods.
-+func (k *KVAwareScorer) Score(
-+	ctx context.Context,
-+	cycle *schedtypes.CycleState,
-+	req *schedtypes.LLMRequest,
-+	pods []schedtypes.Pod,
-+) map[schedtypes.Pod]float64 {
-+	logger := log.FromContext(ctx)
-+
-+	workerID, prefillWorkerID, tokenData, err := k.callDynamoRouter(ctx, req)
-+	if err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "Dynamo call failed; proceeding without worker id")
-+	} else if workerID != "" {
-+		// Only the token count is logged at the default level. The raw token IDs
-+		// grow with the prompt and are derived from it, so they are emitted at
-+		// VERBOSE only.
-+		logger.V(logutil.DEFAULT).Info(
-+			"Dynamo router selected worker",
-+			"workerID", workerID,
-+			"prefillWorkerID", prefillWorkerID,
-+			"tokenDataCount", len(tokenData),
-+		)
-+		logger.V(logutil.VERBOSE).Info("Dynamo router token data", "tokenData", tokenData)
-+
-+		cycle.Write(StateKeyWorkerInstanceID, stateString(workerID))
-+		if req.Headers == nil {
-+			req.Headers = map[string]string{}
-+		}
-+		req.Headers[WorkerIDHeader] = workerID
-+
-+		// Set prefill worker ID if present
-+		if prefillWorkerID != "" {
-+			cycle.Write(StateKeyPrefillWorkerID, stateString(prefillWorkerID))
-+			req.Headers[PrefillWorkerIDHeader] = prefillWorkerID
-+		}
-+
-+		if len(tokenData) > 0 {
-+			if req.Annotations == nil {
-+				req.Annotations = map[string]any{}
-+			}
-+			// Store a private copy so later changes to tokenData do not alias the annotation.
-+			copied := make([]int64, len(tokenData))
-+			copy(copied, tokenData)
-+			req.Annotations[tokenDataAnnotationKey] = copied
-+		}
-+
-+		// GAIE Stage 1: Register request with router bookkeeping
-+		// The request ID comes from Envoy's request ID header
-+		requestID := req.RequestId
-+		if requestID != "" {
-+			cycle.Write(StateKeyRequestID, stateString(requestID))
-+			if addErr := k.callAddRequest(ctx, requestID, tokenData, workerID, prefillWorkerID); addErr != nil {
-+				logger.V(logutil.DEFAULT).Error(addErr, "Failed to add request to router bookkeeping",
-+					"requestID", requestID)
-+			}
-+		} else {
-+			logger.V(logutil.VERBOSE).Info("No request ID available, skipping router bookkeeping")
-+		}
-+	}
-+
-+	// Uniform scores: the selection is conveyed through headers and cycle state.
-+	out := make(map[schedtypes.Pod]float64, len(pods))
-+	for _, p := range pods {
-+		out[p] = 1.0
-+	}
-+	return out
-+}
-+
-+// --------------------------- router call (persistent only) ---------------------------
-+
-+// callDynamoRouter sends an OpenAI-style request built from req to the
-+// persistent worker selection pipeline and returns the selected worker ID,
-+// the prefill worker ID ("" when the returned value is negative), and the
-+// token IDs copied into Go memory.
-+//
-+// It returns an error when FFI initialization failed, the runtime or pipeline
-+// is unavailable, the request cannot be marshalled, or the FFI call does not
-+// return DYNAMO_OK.
-+func (k *KVAwareScorer) callDynamoRouter(
-+	ctx context.Context,
-+	req *schedtypes.LLMRequest,
-+) (workerID string, prefillWorkerID string, tokenData []int64, err error) {
-+	logger := log.FromContext(ctx)
-+
-+	if err := initFFI(); err != nil {
-+		logger.V(logutil.DEFAULT).Error(err, "FFI init failed")
-+		return "", "", nil, err
-+	}
-+	if !runtimeInitialized {
-+		return "", "", nil, fmt.Errorf("dynamo runtime not initialized")
-+	}
-+
-+	// Snapshot the handle under the read lock; the FFI call below runs without it.
-+	pipelineMutex.RLock()
-+	currentPipeline := pipeline
-+	pipelineMutex.RUnlock()
-+
-+	if currentPipeline == nil {
-+		return "", "", nil, fmt.Errorf("dynamo worker selection pipeline not created")
-+	}
-+
-+	// Build OpenAI-compatible JSON request
-+	requestBody := buildOpenAIRequest(req)
-+	requestJSON, jsonErr := json.Marshal(requestBody)
-+	if jsonErr != nil {
-+		logger.V(logutil.DEFAULT).Error(jsonErr, "Failed to marshal OpenAI request")
-+		return "", "", nil, fmt.Errorf("marshal OpenAI request: %w", jsonErr)
-+	}
-+	cRequestJSON := C.CString(string(requestJSON))
-+	defer C.free(unsafe.Pointer(cRequestJSON))
-+
-+	// Output variables
-+	var cDecodeWorkerID C.int64_t
-+	var cPrefillWorkerID C.int64_t
-+	var cTokens *C.uint32_t
-+	var cTokenCount C.size_t
-+	var cAnnotatedJSON *C.char
-+
-+	// Call the worker selection pipeline
-+	rc := C.dynamo_query_worker_selection_and_annotate(
-+		currentPipeline,
-+		cRequestJSON,
-+		&cDecodeWorkerID,
-+		&cPrefillWorkerID,
-+		&cTokens,
-+		&cTokenCount,
-+		&cAnnotatedJSON,
-+	)
-+	if rc != C.DYNAMO_OK {
-+		// NOTE(review): the output buffers are not released on this path —
-+		// presumably the callee allocates nothing on failure; confirm.
-+		return "", "", nil, fmt.Errorf("dynamo_query_worker_selection_and_annotate failed")
-+	}
-+
-+	// Copy tokens into Go memory and free C memory
-+	count := int(uintptr(cTokenCount))
-+	var tokens64 []int64
-+	if count > 0 && cTokens != nil {
-+		// View the C buffer as a Go slice only long enough to widen each token to int64.
-+		src := unsafe.Slice((*uint32)(unsafe.Pointer(cTokens)), count)
-+		tokens64 = make([]int64, count)
-+		for i := 0; i < count; i++ {
-+			tokens64[i] = int64(src[i])
-+		}
-+	}
-+	// The annotated JSON is released here without being read.
-+	C.dynamo_free_worker_selection_result(cTokens, cTokenCount, cAnnotatedJSON)
-+
-+	workerIDStr := fmt.Sprintf("%d", int64(cDecodeWorkerID))
-+	prefillWorkerIDStr := ""
-+	// Rust returns -1 for prefill_worker_id when not in disaggregated mode
-+	if int64(cPrefillWorkerID) >= 0 {
-+		prefillWorkerIDStr = fmt.Sprintf("%d", int64(cPrefillWorkerID))
-+	}
-+	logger.V(logutil.DEFAULT).Info("Worker selection completed",
-+		"workerID", workerIDStr, "prefillWorkerID", prefillWorkerIDStr, "tokenCount", count)
-+
-+	return workerIDStr, prefillWorkerIDStr, tokens64, nil
-+}
-+
-+// buildOpenAIRequest assembles the chat-completion style payload sent to the
-+// router. The prompt falls back to "default prompt" and the model to ffiModel
-+// when req is nil or the corresponding field is blank.
-+func buildOpenAIRequest(req *schedtypes.LLMRequest) map[string]any {
-+	prompt, model := "default prompt", ffiModel
-+	if req != nil {
-+		if strings.TrimSpace(req.Prompt) != "" {
-+			prompt = req.Prompt
-+		}
-+		if strings.TrimSpace(req.TargetModel) != "" {
-+			model = req.TargetModel
-+		}
-+	}
-+
-+	return map[string]any{
-+		"messages":    []map[string]any{{"role": "user", "content": prompt}},
-+		"model":       model,
-+		"max_tokens":  1,
-+		"temperature": 0.0,
-+		"stream":      true,
-+		"nvext": map[string]any{
-+			"annotations": []string{"query_instance_id"},
-+		},
-+	}
-+}
-+
-+// --------------------------- router bookkeeping ---------------------------
-+
-+// callAddRequest registers a request with the router's bookkeeping.
-+// This should be called after worker selection to track active requests.
-+//
-+// workerID must be a non-negative decimal string. tokenData is narrowed to
-+// uint32 before being handed to the FFI call. prefillWorkerID is accepted but
-+// not used. An error is returned when the runtime or pipeline is unavailable,
-+// workerID cannot be parsed, or the FFI call does not return DYNAMO_OK.
-+func (k *KVAwareScorer) callAddRequest(
-+	ctx context.Context,
-+	requestID string,
-+	tokenData []int64,
-+	workerID string,
-+	prefillWorkerID string,
-+) error {
-+	logger := log.FromContext(ctx)
-+
-+	if !runtimeInitialized {
-+		return fmt.Errorf("dynamo runtime not initialized")
-+	}
-+
-+	// Snapshot the handle under the read lock; the FFI call below runs without it.
-+	pipelineMutex.RLock()
-+	currentPipeline := pipeline
-+	pipelineMutex.RUnlock()
-+
-+	if currentPipeline == nil {
-+		return fmt.Errorf("dynamo worker selection pipeline not created")
-+	}
-+
-+	// Parse worker ID (use decode worker for bookkeeping in disagg mode)
-+	var workerIDUint uint64
-+	if _, err := fmt.Sscanf(workerID, "%d", &workerIDUint); err != nil {
-+		return fmt.Errorf("invalid worker ID: %s", workerID)
-+	}
-+
-+	// Convert token data from int64 to uint32
-+	tokens := make([]uint32, len(tokenData))
-+	for i, t := range tokenData {
-+		tokens[i] = uint32(t)
-+	}
-+
-+	cRequestID := C.CString(requestID)
-+	defer C.free(unsafe.Pointer(cRequestID))
-+
-+	// Pass a pointer to the Go slice's backing array; it stays nil for an empty slice.
-+	var cTokens *C.uint32_t
-+	if len(tokens) > 0 {
-+		cTokens = (*C.uint32_t)(unsafe.Pointer(&tokens[0]))
-+	}
-+
-+	rc := C.dynamo_router_add_request(
-+		currentPipeline,
-+		cRequestID,
-+		cTokens,
-+		C.size_t(len(tokens)),
-+		C.uint64_t(workerIDUint),
-+		C.uint32_t(0), // dp_rank = 0 for now
-+	)
-+
-+	if rc != C.DYNAMO_OK {
-+		return fmt.Errorf("dynamo_router_add_request failed")
-+	}
-+
-+	logger.V(logutil.VERBOSE).Info("Added request to router bookkeeping",
-+		"requestID", requestID, "workerID", workerID, "tokenCount", len(tokens))
-+	return nil
-+}
-+
-+// CallMarkPrefillComplete tells the router that prefill has finished for the
-+// request identified by requestID. It is exported so response handlers can
-+// call it. An error is returned when the runtime or pipeline is unavailable or
-+// the FFI call does not return DYNAMO_OK.
-+func CallMarkPrefillComplete(requestID string) error {
-+	if !runtimeInitialized {
-+		return fmt.Errorf("dynamo runtime not initialized")
-+	}
-+
-+	// Take a snapshot of the shared handle; the FFI call runs outside the lock.
-+	pipelineMutex.RLock()
-+	handle := pipeline
-+	pipelineMutex.RUnlock()
-+	if handle == nil {
-+		return fmt.Errorf("dynamo worker selection pipeline not created")
-+	}
-+
-+	cID := C.CString(requestID)
-+	defer C.free(unsafe.Pointer(cID))
-+
-+	if status := C.dynamo_router_mark_prefill_complete(handle, cID); status != C.DYNAMO_OK {
-+		return fmt.Errorf("dynamo_router_mark_prefill_complete failed")
-+	}
-+	return nil
-+}
-+
-+// CallFreeRequest releases the router's bookkeeping for the request identified
-+// by requestID once it has completed or been cancelled. It is exported so
-+// response handlers can call it. An error is returned when the runtime or
-+// pipeline is unavailable or the FFI call does not return DYNAMO_OK.
-+func CallFreeRequest(requestID string) error {
-+	if !runtimeInitialized {
-+		return fmt.Errorf("dynamo runtime not initialized")
-+	}
-+
-+	// Take a snapshot of the shared handle; the FFI call runs outside the lock.
-+	pipelineMutex.RLock()
-+	handle := pipeline
-+	pipelineMutex.RUnlock()
-+	if handle == nil {
-+		return fmt.Errorf("dynamo worker selection pipeline not created")
-+	}
-+
-+	cID := C.CString(requestID)
-+	defer C.free(unsafe.Pointer(cID))
-+
-+	if status := C.dynamo_router_free_request(handle, cID); status != C.DYNAMO_OK {
-+		return fmt.Errorf("dynamo_router_free_request failed")
-+	}
-+	return nil
-+}
-+
-+// --------------------------- shutdown ---------------------------
-+
-+// cleanupDynamo destroys the pipeline (if one exists) and then shuts the
-+// runtime down (if it was initialized), all while holding the pipeline write
-+// lock. A failed pipeline destroy only prints a warning; a failed runtime
-+// shutdown is returned as an error and leaves runtimeInitialized set.
-+func cleanupDynamo() error {
-+	pipelineMutex.Lock()
-+	defer pipelineMutex.Unlock()
-+
-+	if handle := pipeline; handle != nil {
-+		status := C.dynamo_destroy_worker_selection_pipeline(handle)
-+		if status != C.DYNAMO_OK {
-+			fmt.Printf("Warning: dynamo_destroy_worker_selection_pipeline failed\n")
-+		}
-+		// The handle is cleared regardless of the destroy result.
-+		pipeline = nil
-+	}
-+
-+	if !runtimeInitialized {
-+		return nil
-+	}
-+	if status := C.dynamo_llm_shutdown(); status != C.DYNAMO_OK {
-+		return fmt.Errorf("dynamo_llm_shutdown failed")
-+	}
-+	runtimeInitialized = false
-+	return nil
-+}
-diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
-index 2962117..7da1d43 100644
--- a/pkg/epp/scheduling/types/types.go
-+++ b/pkg/epp/scheduling/types/types.go
-@@ -33,10 +33,12 @@ type LLMRequest struct {
- 	Prompt string
- 	// Headers is a map of the request headers.
- 	Headers map[string]string
-+	// Annotations provides plugin-specific data that should travel alongside the request.
-+	Annotations map[string]any
- }
- 
- func (r *LLMRequest) String() string {
-	return fmt.Sprintf("RequestID: %s, TargetModel: %s, PromptLength: %d, Headers: %v", r.RequestId, r.TargetModel, len(r.Prompt), r.Headers)
-+	return fmt.Sprintf("RequestID: %s, TargetModel: %s, PromptLength: %d, Headers: %v, Annotations: %v", r.RequestId, r.TargetModel, len(r.Prompt), r.Headers, r.Annotations)
- }
- 
- type Pod interface {
--- a/deploy/inference-gateway/epp/Dockerfile
+++ b/deploy/inference-gateway/epp/Dockerfile
+# SPDX-FileCopyrightText:  Copyright The Kubernetes Authors.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#  http://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#  Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
+
+# Dynamo EPP Dockerfile
+# Builds a custom EPP image with Dynamo KV-aware routing plugins
+#
+# PREREQUISITES: Run `make dynamo-lib` before building this image to ensure
+# the Dynamo FFI library and headers are in place.
+
+# Image used to compile the EPP binary, and image used for the runtime stage.
+ARG BUILDER_IMAGE=golang:1.24-bookworm
+ARG BASE_IMAGE=ubuntu:24.04
+
+# =============================================================================
+# Build stage
+# =============================================================================
+FROM ${BUILDER_IMAGE} AS builder
+
+# Docker buildx provides these automatically for multi-platform builds
+ARG TARGETOS=linux
+ARG TARGETARCH
+
+# Values stamped into the binary through -ldflags below.
+ARG COMMIT_SHA
+ARG BUILD_REF
+
+WORKDIR /workspace
+
+# Install build dependencies for CGO.
+# --no-install-recommends keeps the layer limited to the packages listed here.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    libc-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy go mod files first for better caching
+COPY go.mod go.sum ./
+RUN go mod download
+
+# Copy the source code (including pre-built Dynamo library)
+COPY . .
+
+# Verify Dynamo library exists, so a missing library fails with a clear message
+# before the Go build starts.
+RUN if [ ! -f "pkg/plugins/dynamo_kv_scorer/lib/libdynamo_llm_capi.a" ]; then \
+        echo "ERROR: Dynamo library not found!"; \
+        echo "Run 'make dynamo-lib' before building the Docker image."; \
+        exit 1; \
+    fi
+
+# Build with CGO enabled for the Dynamo FFI
+# Use TARGETOS/TARGETARCH from Docker buildx for proper platform support
+RUN CGO_ENABLED=1 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \
+    -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.GitVersion=${BUILD_REF} \
+              -X sigs.k8s.io/gateway-api-inference-extension/version.GitCommit=${COMMIT_SHA}" \
+    -o epp ./cmd/epp
+
+# =============================================================================
+# Runtime stage
+# =============================================================================
+FROM ${BASE_IMAGE}
+
+# Install runtime dependencies.
+# --no-install-recommends keeps optional packages out of the runtime image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libstdc++6 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /
+
+# Copy the binary from builder
+COPY --from=builder /workspace/epp .
+
+# Note: EPP config is mounted via Kubernetes ConfigMap at runtime
+# See helm/dynamo-gaie/templates/epp-configmap.yaml
+
+# Create non-root user and run as that UID with the nogroup GID
+RUN useradd -r -u 65532 -g nogroup nonroot
+USER 65532:65534
+
+ENTRYPOINT ["/epp"]
--- a/deploy/inference-gateway/epp/Makefile
+++ b/deploy/inference-gateway/epp/Makefile
+# Dynamo EPP Makefile
+# Builds custom EPP image with Dynamo KV-aware routing plugins
+
+# Image configuration
+# Image lives in local cache only, not pushed to any registry
+DOCKER_SERVER ?= dynamo
+IMAGE_NAME := dynamo-epp
+GIT_COMMIT_SHA ?= $(shell git rev-parse HEAD 2>/dev/null || echo "unknown")
+GIT_TAG ?= $(shell git describe --tags --dirty --always 2>/dev/null || echo "dev")
+IMAGE_REPO ?= $(DOCKER_SERVER)/$(IMAGE_NAME)
+IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
+
+# Build configuration
+# Auto-detect host architecture for consistent builds with Dynamo library
+# The Dynamo library is built for the host arch, so Docker must match
+HOST_ARCH := $(shell uname -m)
+ifeq ($(HOST_ARCH),x86_64)
+    PLATFORMS ?= linux/amd64
+else ifeq ($(HOST_ARCH),aarch64)
+    PLATFORMS ?= linux/arm64
+else ifeq ($(HOST_ARCH),arm64)
+    PLATFORMS ?= linux/arm64
+else
+    # Unrecognized host architectures fall back to amd64.
+    PLATFORMS ?= linux/amd64
+endif
+# Docker proxy for avoiding rate limits (e.g., ECR mirror)
+# Set DOCKER_PROXY to prefix base images, e.g., DOCKER_PROXY=my-registry.com/dockerhub/
+DOCKER_PROXY ?=
+
+DOCKER_BUILDX_CMD ?= docker buildx
+IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
+# image-build always passes BUILDER_IMAGE as a build arg, which overrides the
+# Dockerfile's own default, so keep this in sync with that default
+# (golang:1.24-bookworm) rather than the floating golang:1.24 tag.
+BUILDER_IMAGE ?= $(DOCKER_PROXY)golang:1.24-bookworm
+BASE_IMAGE ?= $(DOCKER_PROXY)ubuntu:24.04
+
+# Container tool
+CONTAINER_TOOL ?= docker
+
+# Kind cluster name for local testing
+KIND_CLUSTER ?= kind
+
+# Project directory
+PROJECT_DIR := $(shell pwd)
+
+# Dynamo directories
+# Default: assume we're in dynamo/deploy/inference-gateway/epp
+DYNAMO_DIR ?= $(shell cd $(PROJECT_DIR)/../../.. && pwd)
+DYNAMO_LIB_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/lib
+DYNAMO_INCLUDE_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/include
+
+.PHONY: help
+help: ## Display this help
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ Development
+
+.PHONY: fmt
+fmt: ## Run go fmt
+	go fmt ./...
+
+.PHONY: vet
+vet: ## Run go vet
+	go vet ./...
+
+.PHONY: tidy
+tidy: ## Run go mod tidy
+	go mod tidy
+
+# Tests link the cgo plugin against the Dynamo static library, so check for it
+# first (as the build target does) to fail with a clear message.
+.PHONY: test
+test: dynamo-lib-check ## Run tests (requires CGO and Dynamo libraries)
+	CGO_ENABLED=1 go test ./... -v
+
+##@ Build
+
+.PHONY: build
+build: dynamo-lib-check ## Build the EPP binary locally (requires CGO and Dynamo libraries)
+	CGO_ENABLED=1 go build -o bin/epp ./cmd/epp
+
+.PHONY: build-with-lib
+build-with-lib: dynamo-lib build ## Build Dynamo library and EPP binary
+
+# PUSH and LOAD are empty unless set by image-push / image-load (or on the
+# command line), so a plain image-build neither pushes nor loads the result.
+.PHONY: image-build
+image-build: dynamo-lib-check ## Build the Docker image using buildx
+	$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
+		--platform=$(PLATFORMS) \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
+		--build-arg COMMIT_SHA=$(GIT_COMMIT_SHA) \
+		--build-arg BUILD_REF=$(GIT_TAG) \
+		$(PUSH) \
+		$(LOAD) \
+		.
+
+# Target-specific variable: sets PUSH for this target and its image-build prerequisite.
+.PHONY: image-push
+image-push: PUSH=--push ## Build and push the Docker image
+image-push: image-build
+
+# Target-specific variable: sets LOAD for this target and its image-build prerequisite.
+.PHONY: image-load
+image-load: LOAD=--load ## Build and load the Docker image locally
+image-load: image-build
+
+# Loads the tagged image from the local Docker daemon into the kind cluster named by KIND_CLUSTER.
+.PHONY: image-kind
+image-kind: image-load ## Build and load the image into kind cluster
+	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
+
+##@ Local Development with Buildx
+
+# Creates a temporary buildx builder, runs image-build with it, and always
+# removes the builder afterwards. The recipe exits with the build's status, or
+# with a failure if the builder could not be removed.
+.PHONY: image-local-build
+image-local-build: ## Build image using a new buildx builder
+	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use) && { \
+		$(MAKE) image-build PUSH=$(PUSH) LOAD=$(LOAD); status=$$?; \
+		$(DOCKER_BUILDX_CMD) rm "$$BUILDER" && exit $$status; \
+	}
+
+.PHONY: image-local-push
+image-local-push: PUSH=--push ## Build and push using local buildx builder
+image-local-push: image-local-build
+
+.PHONY: image-local-load
+image-local-load: LOAD=--load ## Build and load using local buildx builder
+image-local-load: image-local-build
+
+##@ Dynamo Library Build
+
+# Builds the libdynamo_llm crate in release mode under DYNAMO_DIR, generates the
+# C header with cbindgen, and copies the header and static library into this
+# project's plugin include/ and lib/ directories.
+# If cbindgen fails, the checked-in fallback_header.h is copied in its place
+# without any message.
+.PHONY: dynamo-lib
+dynamo-lib: ## Build Dynamo static library and copy to project
+	@echo "Building Dynamo static library..."
+	cd "$(DYNAMO_DIR)" && cargo build --release -p libdynamo_llm
+	@echo "Generating C header..."
+	@mkdir -p "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm"
+	cd "$(DYNAMO_DIR)" && \
+		(cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm \
+			--output lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h || \
+		cp lib/bindings/c/src/fallback_header.h lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h)
+	@echo "Copying files to EPP project..."
+	@mkdir -p "$(DYNAMO_LIB_DIR)"
+	@mkdir -p "$(DYNAMO_INCLUDE_DIR)"
+	cp "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h" "$(DYNAMO_INCLUDE_DIR)/"
+	cp "$(DYNAMO_DIR)/target/release/libdynamo_llm_capi.a" "$(DYNAMO_LIB_DIR)/"
+	@echo "Dynamo library ready!"
+
+.PHONY: dynamo-lib-check
+dynamo-lib-check: ## Check if Dynamo library files exist
+	@if [ ! -f "$(DYNAMO_LIB_DIR)/libdynamo_llm_capi.a" ]; then \
+		echo "ERROR: Dynamo library not found. Run 'make dynamo-lib' first."; \
+		exit 1; \
+	fi
+	@if [ ! -f "$(DYNAMO_INCLUDE_DIR)/llm_engine.h" ]; then \
+		echo "ERROR: Dynamo header not found. Run 'make dynamo-lib' first."; \
+		exit 1; \
+	fi
+	@echo "Dynamo library files found."
+
+##@ Clean
+
+.PHONY: clean
+clean: ## Clean build artifacts
+	rm -rf bin/
+	go clean
+
+##@ All-in-one Build
+
+.PHONY: all
+all: dynamo-lib image-local-load ## Build Dynamo lib and Docker image, load locally
+
+.PHONY: all-push
+all-push: dynamo-lib image-push ## Build Dynamo lib and Docker image, push to registry
+
+.PHONY: all-kind
+all-kind: dynamo-lib image-kind ## Build Dynamo lib and Docker image, load to kind
+
+##@ Info
+
+.PHONY: info
+info: ## Show build info
+	@echo "Image Tag: $(IMAGE_TAG)"
+	@echo "Git Commit: $(GIT_COMMIT_SHA)"
+	@echo "Git Tag: $(GIT_TAG)"
+	@echo "Platforms: $(PLATFORMS)"
+	@echo "Docker Proxy: $(DOCKER_PROXY)"
+	@echo "Builder Image: $(BUILDER_IMAGE)"
+	@echo "Base Image: $(BASE_IMAGE)"
+	@echo "Dynamo Dir: $(DYNAMO_DIR)"
+	@echo "Dynamo Lib Dir: $(DYNAMO_LIB_DIR)"
+	@echo "Dynamo Include Dir: $(DYNAMO_INCLUDE_DIR)"
+
--- a/deploy/inference-gateway/epp/cmd/epp/main.go
+++ b/deploy/inference-gateway/epp/cmd/epp/main.go
+/*
+Copyright 2025 NVIDIA Corporation.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Dynamo EPP - Custom Endpoint Picker Plugin for NVIDIA Dynamo
+//
+// This EPP integrates with the Gateway API Inference Extension to provide
+// KV-aware routing for Dynamo inference backends.
+//
+// # Header-Based Routing
+//
+// The Dynamo KV scorer sets routing headers that the Lua filter at the
+// gateway uses to inject nvext into the request body:
+//
+//   - x-worker-instance-id: Selected worker ID (decode worker in disagg mode)
+//   - x-prefiller-host-port: Prefill worker ID (disaggregated mode only)
+//   - x-dynamo-routing-mode: "aggregated" or "disaggregated"
+//
+// The Lua filter reads these headers and injects:
+//   - Aggregated: {"nvext": {"backend_instance_id": <worker_id>}}
+//   - Disaggregated: {"nvext": {"prefill_worker_id": <prefill>, "decode_worker_id": <decode>}}
+package main
+
+import (
+	"os"
+
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+
+	// Dynamo plugins
+	dynscorer "github.com/nvidia/dynamo/deploy/inference-gateway/pkg/plugins/dynamo_kv_scorer"
+)
+
+// main registers the Dynamo KV-aware scorer plugin and then hands control to
+// the standard GAIE runner. If the runner returns an error, the error text is
+// written to stderr and the process exits with status 1; otherwise main
+// returns normally.
+func main() {
+	// Register Dynamo custom plugins:
+	// - kv-aware-scorer: Implements Scorer, PreRequest, and ResponseComplete interfaces
+	//   - Score: Calls Dynamo router to select workers based on KV cache, sets routing headers
+	//   - PreRequest: Registers request with router bookkeeping after scheduling is finalized
+	//   - ResponseComplete: Cleans up router bookkeeping when response completes
+	plugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
+
+	// Run using standard GAIE runner (it registers built-in plugins automatically)
+	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
+		// Report the cause before exiting so a failed start is never silent,
+		// even on a path where the runner did not log the error itself.
+		// The write result is deliberately ignored: we are exiting regardless.
+		_, _ = os.Stderr.WriteString("dynamo-epp: runner exited with error: " + err.Error() + "\n")
+		os.Exit(1)
+	}
+}
--- a/deploy/inference-gateway/epp/go.mod
+++ b/deploy/inference-gateway/epp/go.mod
+module github.com/nvidia/dynamo/deploy/inference-gateway
+
+go 1.24.0
+
+require (
+	sigs.k8s.io/controller-runtime v0.22.4
+	sigs.k8s.io/gateway-api-inference-extension v1.2.1
+)
+
+require (
+	cel.dev/expr v0.24.0 // indirect
+	github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
+	github.com/beorn7/perks v1.0.1 // indirect
+	github.com/blang/semver/v4 v4.0.0 // indirect
+	github.com/cenkalti/backoff/v5 v5.0.3 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+	github.com/dennwc/varint v1.0.0 // indirect
+	github.com/emicklei/go-restful/v3 v3.13.0 // indirect
+	github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect
+	github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
+	github.com/evanphx/json-patch/v5 v5.9.11 // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/fsnotify/fsnotify v1.9.0 // indirect
+	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
+	github.com/go-logr/zapr v1.3.0 // indirect
+	github.com/go-openapi/jsonpointer v0.21.2 // indirect
+	github.com/go-openapi/jsonreference v0.21.0 // indirect
+	github.com/go-openapi/swag v0.23.1 // indirect
+	github.com/gogo/protobuf v1.3.2 // indirect
+	github.com/google/btree v1.1.3 // indirect
+	github.com/google/cel-go v0.26.0 // indirect
+	github.com/google/gnostic-models v0.7.0 // indirect
+	github.com/google/go-cmp v0.7.0 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
+	github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
+	github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
+	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/josharian/intern v1.0.0 // indirect
+	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/mailru/easyjson v0.9.0 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/prometheus/client_golang v1.23.2 // indirect
+	github.com/prometheus/client_model v0.6.2 // indirect
+	github.com/prometheus/common v0.67.4 // indirect
+	github.com/prometheus/procfs v0.17.0 // indirect
+	github.com/prometheus/prometheus v0.308.1 // indirect
+	github.com/spf13/cobra v1.9.1 // indirect
+	github.com/spf13/pflag v1.0.10 // indirect
+	github.com/stoewer/go-strcase v1.3.0 // indirect
+	github.com/x448/float16 v0.8.4 // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
+	go.opentelemetry.io/otel v1.39.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
+	go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 // indirect
+	go.opentelemetry.io/otel/metric v1.39.0 // indirect
+	go.opentelemetry.io/otel/sdk v1.39.0 // indirect
+	go.opentelemetry.io/otel/trace v1.39.0 // indirect
+	go.opentelemetry.io/proto/otlp v1.9.0 // indirect
+	go.uber.org/atomic v1.11.0 // indirect
+	go.uber.org/multierr v1.11.0 // indirect
+	go.uber.org/zap v1.27.1 // indirect
+	go.yaml.in/yaml/v2 v2.4.3 // indirect
+	go.yaml.in/yaml/v3 v3.0.4 // indirect
+	golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect
+	golang.org/x/net v0.47.0 // indirect
+	golang.org/x/oauth2 v0.32.0 // indirect
+	golang.org/x/sync v0.19.0 // indirect
+	golang.org/x/sys v0.39.0 // indirect
+	golang.org/x/term v0.37.0 // indirect
+	golang.org/x/text v0.31.0 // indirect
+	golang.org/x/time v0.13.0 // indirect
+	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
+	google.golang.org/grpc v1.78.0 // indirect
+	google.golang.org/protobuf v1.36.11 // indirect
+	gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
+	gopkg.in/inf.v0 v0.9.1 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+	k8s.io/api v0.34.3 // indirect
+	k8s.io/apiextensions-apiserver v0.34.3 // indirect
+	k8s.io/apimachinery v0.34.3 // indirect
+	k8s.io/apiserver v0.34.3 // indirect
+	k8s.io/client-go v0.34.3 // indirect
+	k8s.io/component-base v0.34.3 // indirect
+	k8s.io/klog/v2 v2.130.1 // indirect
+	k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
+	k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d // indirect
+	sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect
+	sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
+	sigs.k8s.io/randfill v1.0.0 // indirect
+	sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
+	sigs.k8s.io/yaml v1.6.0 // indirect
+)
+
+// NOTE: For local development, uncomment the replace directive below.
+// For Docker builds, keep it commented out to use the published v1.2.1 release.
+// replace sigs.k8s.io/gateway-api-inference-extension => ../../../gaie_latest/gateway-api-inference-extension
--- a/deploy/inference-gateway/epp/go.sum
+++ b/deploy/inference-gateway/epp/go.sum
+cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
+cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
+cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4=
+cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ=
+cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc=
+cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
+cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
+cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
+github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1 h1:5YTBM8QDVIBN3sxBil89WfdAAqDZbyJTgh688DSxX5w=
+github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw=
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0 h1:wL5IEG5zb7BVv1Kv0Xm92orq+5hB5Nipn3B5tn4Rqfk=
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I=
+github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA=
+github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
+github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0 h1:XkkQbfMyuH2jTSjQjSoihryI8GINRcs4xp8lNawg0FI=
+github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
+github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
+github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
+github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
+github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0=
+github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs=
+github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
+github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
+github.com/aws/aws-sdk-go-v2 v1.39.6 h1:2JrPCVgWJm7bm83BDwY5z8ietmeJUbh3O2ACnn+Xsqk=
+github.com/aws/aws-sdk-go-v2 v1.39.6/go.mod h1:c9pm7VwuW0UPxAEYGyTmyurVcNrbF6Rt/wixFqDhcjE=
+github.com/aws/aws-sdk-go-v2/config v1.31.17 h1:QFl8lL6RgakNK86vusim14P2k8BFSxjvUkcWLDjgz9Y=
+github.com/aws/aws-sdk-go-v2/config v1.31.17/go.mod h1:V8P7ILjp/Uef/aX8TjGk6OHZN6IKPM5YW6S78QnRD5c=
+github.com/aws/aws-sdk-go-v2/credentials v1.18.21 h1:56HGpsgnmD+2/KpG0ikvvR8+3v3COCwaF4r+oWwOeNA=
+github.com/aws/aws-sdk-go-v2/credentials v1.18.21/go.mod h1:3YELwedmQbw7cXNaII2Wywd+YY58AmLPwX4LzARgmmA=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M=
+github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
+github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.1 h1:0JPwLz1J+5lEOfy/g0SURC9cxhbQ1lIMHMa+AHZSzz0=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.1/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.5 h1:OWs0/j2UYR5LOGi88sD5/lhN6TDLG6SfA7CqsQO9zF0=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.5/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo=
+github.com/aws/aws-sdk-go-v2/service/sts v1.39.1 h1:mLlUgHn02ue8whiR4BmxxGJLR2gwU6s6ZzJ5wDamBUs=
+github.com/aws/aws-sdk-go-v2/service/sts v1.39.1/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk=
+github.com/aws/smithy-go v1.23.2 h1:Crv0eatJUQhaManss33hS5r40CG3ZFH+21XSkqMrIUM=
+github.com/aws/smithy-go v1.23.2/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0=
+github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 h1:6df1vn4bBlDDo4tARvBm7l6KA9iVMnE3NWizDeWSrps=
+github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3/go.mod h1:CIWtjkly68+yqLPbvwwR/fjNJA/idrtULjZWh2v1ys0=
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
+github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
+github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
+github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f h1:Y8xYupdHxryycyPlc9Y+bSQAYZnetRJ70VMVKm5CKI0=
+github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f/go.mod h1:HlzOvOjVBOfTGSRXRyY0OiCS/3J1akRGQQpRO/7zyF4=
+github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE=
+github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA=
+github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes=
+github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
+github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g=
+github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98=
+github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
+github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
+github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k=
+github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ=
+github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
+github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
+github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
+github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
+github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
+github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
+github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
+github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
+github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
+github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA=
+github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk=
+github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
+github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
+github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU=
+github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0=
+github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
+github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
+github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
+github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
+github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
+github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
+github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
+github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
+github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI=
+github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM=
+github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo=
+github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
+github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ=
+github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U=
+github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
+github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4=
+github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA=
+github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo=
+github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc=
+github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo=
+github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
+github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 h1:cLN4IBkmkYZNnk7EAJ0BHIethd+J6LqxFNw5mSiI2bM=
+github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
+github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
+github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
+github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
+github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
+github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
+github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
+github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
+github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
+github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU=
+github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8=
+github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
+github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
+github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
+github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
+github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s=
+github.com/oklog/ulid/v2 v2.1.1/go.mod h1:rcEKHmBBKfef9DhnvX7y1HZBYxjXb0cP5ExxNsTT1QQ=
+github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns=
+github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
+github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
+github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
+github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
+github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
+github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
+github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
+github.com/prometheus/client_golang/exp v0.0.0-20251212205219-7ba246a648ca h1:BOxmsLoL2ymn8lXJtorca7N/m+2vDQUDoEtPjf0iAxA=
+github.com/prometheus/client_golang/exp v0.0.0-20251212205219-7ba246a648ca/go.mod h1:gndBHh3ZdjBozGcGrjUYjN3UJLRS3l2drALtu4lUt+k=
+github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
+github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
+github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc=
+github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI=
+github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos=
+github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM=
+github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
+github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
+github.com/prometheus/prometheus v0.308.1 h1:ApMNI/3/es3Ze90Z7CMb+wwU2BsSYur0m5VKeqHj7h4=
+github.com/prometheus/prometheus v0.308.1/go.mod h1:aHjYCDz9zKRyoUXvMWvu13K9XHOkBB12XrEqibs3e0A=
+github.com/prometheus/sigv4 v0.3.0 h1:QIG7nTbu0JTnNidGI1Uwl5AGVIChWUACxn2B/BQ1kms=
+github.com/prometheus/sigv4 v0.3.0/go.mod h1:fKtFYDus2M43CWKMNtGvFNHGXnAJJEGZbiYCmVp/F8I=
+github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
+github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
+github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
+github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
+github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
+github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs=
+github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
+github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
+github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
+go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
+go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
+go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 h1:f0cb2XPmrqn4XMy9PNliTgRKJgS5WcL/u0/WRYGz4t0=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0/go.mod h1:vnakAaFckOMiMtOIhFI2MNH4FYrZzXCYxmb1LlhoGz8=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 h1:in9O8ESIOlwJAEGTkkf34DesGRAc/Pn8qJ7k3r/42LM=
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0/go.mod h1:Rp0EXBm5tfnv0WL+ARyO/PHBEaEAT8UUHQ6AGJcSq6c=
+go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 h1:8UPA4IbVZxpsD76ihGOQiFml99GPAEZLohDXvqHdi6U=
+go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0/go.mod h1:MZ1T/+51uIVKlRzGw1Fo46KEWThjlCBZKl2LzY5nv4g=
+go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
+go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
+go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18=
+go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE=
+go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8=
+go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew=
+go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
+go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
+go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
+go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
+go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
+go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
+go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
+go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
+go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
+go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
+go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc=
+go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
+go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
+go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
+go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
+go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU=
+golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc=
+golang.org/x/exp v0.0.0-20250808145144-a408d31f581a h1:Y+7uR/b1Mw2iSXZ3G//1haIiSElDQZ8KWh0h+sZPG90=
+golang.org/x/exp v0.0.0-20250808145144-a408d31f581a/go.mod h1:rT6SFzZ7oxADUDx58pcaKFTcZ+inxAa9fTrYx/uVYwg=
+golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA=
+golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
+golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
+golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY=
+golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
+golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
+golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
+golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
+golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI=
+golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
+golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
+golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
+golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
+gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
+gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
+gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
+google.golang.org/api v0.252.0 h1:xfKJeAJaMwb8OC9fesr369rjciQ704AjU/psjkKURSI=
+google.golang.org/api v0.252.0/go.mod h1:dnHOv81x5RAmumZ7BWLShB/u7JZNeyalImxHmtTHxqw=
+google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls=
+google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
+google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc=
+google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
+gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
+gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
+gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4=
+k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk=
+k8s.io/apiextensions-apiserver v0.34.3 h1:p10fGlkDY09eWKOTeUSioxwLukJnm+KuDZdrW71y40g=
+k8s.io/apiextensions-apiserver v0.34.3/go.mod h1:aujxvqGFRdb/cmXYfcRTeppN7S2XV/t7WMEc64zB5A0=
+k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE=
+k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
+k8s.io/apiserver v0.34.3 h1:uGH1qpDvSiYG4HVFqc6A3L4CKiX+aBWDrrsxHYK0Bdo=
+k8s.io/apiserver v0.34.3/go.mod h1:QPnnahMO5C2m3lm6fPW3+JmyQbvHZQ8uudAu/493P2w=
+k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A=
+k8s.io/client-go v0.34.3/go.mod h1:OxxeYagaP9Kdf78UrKLa3YZixMCfP6bgPwPwNBQBzpM=
+k8s.io/component-base v0.34.3 h1:zsEgw6ELqK0XncCQomgO9DpUIzlrYuZYA0Cgo+JWpVk=
+k8s.io/component-base v0.34.3/go.mod h1:5iIlD8wPfWE/xSHTRfbjuvUul2WZbI2nOUK65XL0E/c=
+k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
+k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
+k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
+k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
+k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0=
+k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
+sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
+sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
+sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A=
+sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8=
+sigs.k8s.io/gateway-api-inference-extension v1.2.1 h1:kQjnFWW8YLCN42EZxDNxTuDE0xHkPkoyaEVpQ5sNCBQ=
+sigs.k8s.io/gateway-api-inference-extension v1.2.1/go.mod h1:/HWeqxuOMjFM56YwJ2Spt3qceK7Spz4hk6ZfXYgE9a8=
+sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
+sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
+sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
+sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
+sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E=
+sigs.k8s.io/structured-merge-diff/v6 v6.3.1/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
+sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
+sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
--- a/deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
+++ b/deploy/inference-gateway/epp/pkg/plugins/dynamo_kv_scorer/plugin.go
+/*
+Copyright 2025 NVIDIA Corporation.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package dynamo_kv_scorer
+
+/*
+// cgo build configuration: headers are taken from ./include next to this file and
+// the Dynamo C API is linked from the static archive ./lib/libdynamo_llm_capi.a,
+// together with the C++ runtime, libdl, libpthread and libm.
+#cgo CPPFLAGS: -I${SRCDIR}/include
+#cgo CXXFLAGS: -std=c++17
+#cgo LDFLAGS: ${SRCDIR}/lib/libdynamo_llm_capi.a -lstdc++ -ldl -lpthread -lm
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>   // for free
+#include <stdbool.h>
+
+// enum underlying type is uint32_t; matches cbindgen output
+typedef uint32_t dynamo_llm_result_t;
+// Every function declared below returns one of these two status codes.
+enum { DYNAMO_OK = 0, DYNAMO_ERR = 1 };
+
+// opaque handle forward-decl
+struct WorkerSelectionPipeline;
+typedef struct WorkerSelectionPipeline WorkerSelectionPipeline;
+
+// Prototypes (C-compatible)
+// Called once from initFFI with the namespace/component/worker id/KV block size
+// loaded by loadDynamoConfig.
+dynamo_llm_result_t dynamo_llm_init(const char *namespace_c_str,
+                                    const char *component_c_str,
+                                    int64_t worker_id,
+                                    uint32_t kv_block_size);
+
+// Called from cleanupDynamo.
+dynamo_llm_result_t dynamo_llm_shutdown(void);
+// NOTE(review): the next three prototypes (load publisher / KV event publishing)
+// are declared here but never called from the Go code in this file.
+dynamo_llm_result_t dynamo_llm_load_publisher_create(void);
+
+dynamo_llm_result_t dynamo_kv_event_publish_stored(uint64_t event_id,
+                                                   const uint32_t *token_ids,
+                                                   const uintptr_t *num_block_tokens,
+                                                   const uint64_t *block_ids,
+                                                   size_t num_blocks,
+                                                   const uint64_t *parent_hash,
+                                                   uint64_t lora_id);
+
+dynamo_llm_result_t dynamo_kv_event_publish_removed(uint64_t event_id,
+                                                    const uint64_t *block_ids,
+                                                    size_t num_blocks);
+
+// Creates the pipeline handle that initFFI stores in the package-level `pipeline`
+// variable; the handle is written through pipeline_out.
+dynamo_llm_result_t dynamo_create_worker_selection_pipeline(const char *namespace_c_str,
+                                                            const char *component_c_str,
+                                                            const char *model_name_c_str,
+                                                            bool use_kv_routing,
+                                                            double busy_threshold,
+                                                            double overlap_score_weight,
+                                                            double router_temperature,
+                                                            bool use_kv_events,
+                                                            bool router_replica_sync,
+                                                            bool enforce_disagg,
+                                                            WorkerSelectionPipeline **pipeline_out);
+
+// Called from cleanupDynamo to release the handle created above.
+dynamo_llm_result_t dynamo_destroy_worker_selection_pipeline(WorkerSelectionPipeline *pipeline);
+
+// Called from callDynamoRouter. The token buffer and annotated JSON returned through
+// the out-parameters are handed back to dynamo_free_worker_selection_result by the caller.
+dynamo_llm_result_t dynamo_query_worker_selection_and_annotate(WorkerSelectionPipeline *pipeline,
+                                                               const char *request_json_c_str,
+                                                               int64_t *decode_worker_id_out,
+                                                               int64_t *prefill_worker_id_out,
+                                                               uint32_t **token_ids_out,
+                                                               size_t *token_count_out,
+                                                               char **annotated_request_json_out);
+
+dynamo_llm_result_t dynamo_free_worker_selection_result(uint32_t *token_ids,
+                                                        size_t token_count,
+                                                        char *annotated_request_json);
+
+// Router bookkeeping functions for GAIE integration
+// (used by callAddRequest, CallMarkPrefillComplete and callFreeRequestInternal).
+dynamo_llm_result_t dynamo_router_add_request(WorkerSelectionPipeline *pipeline,
+                                              const char *request_id_c_str,
+                                              const uint32_t *token_ids,
+                                              size_t token_count,
+                                              uint64_t worker_id,
+                                              uint32_t dp_rank);
+
+dynamo_llm_result_t dynamo_router_mark_prefill_complete(WorkerSelectionPipeline *pipeline,
+                                                        const char *request_id_c_str);
+
+dynamo_llm_result_t dynamo_router_free_request(WorkerSelectionPipeline *pipeline,
+                                               const char *request_id_c_str);
+*/
+import "C"
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"strings"
+	"sync"
+	"unsafe"
+
+	log "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+// Plugin identity and the request headers written by Score:
+//   - PluginName is the default instance name and KVAwareScorerType the plugin
+//     type used by NewKVAwareScorer.
+//   - WorkerIDHeader always carries the selected (decode) worker ID.
+//   - PrefillWorkerIDHeader is only set when the router returned a prefill worker
+//     that differs from the decode worker.
+//   - RoutingModeHeader is set to "aggregated" or "disaggregated" accordingly.
+const (
+	PluginName            = "dynamo-kv-scorer"
+	KVAwareScorerType     = "kv-aware-scorer"
+	WorkerIDHeader        = "x-worker-instance-id"
+	PrefillWorkerIDHeader = "x-prefill-instance-id"
+	RoutingModeHeader     = "x-dynamo-routing-mode"
+
+	// stateKey is the key used to store routing state in PluginState
+	stateKey = "dynamo-routing-state"
+)
+
+// --------------------------- config / env ---------------------------
+
+// warmupOnce makes KVAwareScorerFactory attempt the FFI warm-up only once per
+// process; warmupErr keeps the outcome of that single attempt (including a
+// recovered panic) and is re-checked on every factory call.
+var warmupOnce sync.Once
+var warmupErr error
+
+// params is the JSON parameter object accepted by KVAwareScorerFactory.
+// It currently declares no fields.
+type params struct{}
+
+// DynamoRoutingState holds routing information passed from Score() to PreRequest().
+// This is stored in PluginState keyed by request ID.
+type DynamoRoutingState struct {
+	// WorkerID is the decode worker ID chosen by the router, formatted as a decimal string.
+	WorkerID string
+	// PrefillWorkerID is the prefill worker ID as a decimal string, or "" when the
+	// router did not return one (it reports a negative ID in that case).
+	PrefillWorkerID string
+	// TokenData holds the token IDs returned by the router.
+	// PreRequest forwards them to callAddRequest so the router can register the
+	// request in its bookkeeping. They are not sent to the worker via headers.
+	TokenData []int64
+}
+
+// Clone returns an independent copy of the routing state, satisfying the
+// plugins.StateData interface. The token slice is duplicated so the copy does
+// not share a backing array with the receiver. A nil receiver yields nil.
+func (s *DynamoRoutingState) Clone() plugins.StateData {
+	if s == nil {
+		return nil
+	}
+	// Copy the scalar fields by value, then replace the slice with its own storage.
+	dup := *s
+	if s.TokenData != nil {
+		dup.TokenData = make([]int64, len(s.TokenData))
+		copy(dup.TokenData, s.TokenData)
+	}
+	return &dup
+}
+
+// KVAwareScorer is the plugin implemented by this package. It keeps its typed
+// name and a PluginState store in which Score saves per-request routing state
+// for PreRequest to pick up.
+type KVAwareScorer struct {
+	typedName   plugins.TypedName
+	pluginState *plugins.PluginState
+}
+
+// Compile-time checks that KVAwareScorer satisfies every interface it is used as.
+var _ plugins.Plugin = (*KVAwareScorer)(nil)
+var _ framework.Scorer = (*KVAwareScorer)(nil)
+var _ rc.PreRequest = (*KVAwareScorer)(nil)
+var _ rc.ResponseComplete = (*KVAwareScorer)(nil)
+
+// NewKVAwareScorer builds a scorer with the default type/name pair and a fresh
+// PluginState created from ctx.
+func NewKVAwareScorer(ctx context.Context) *KVAwareScorer {
+	scorer := new(KVAwareScorer)
+	scorer.typedName = plugins.TypedName{Type: KVAwareScorerType, Name: PluginName}
+	scorer.pluginState = plugins.NewPluginState(ctx)
+	return scorer
+}
+
+// WithName overrides the instance name of the scorer and returns the same
+// scorer so calls can be chained.
+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer {
+	k.typedName.Name = name
+	return k
+}
+
+func KVAwareScorerFactory(name string, raw json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
+	p := params{}
+	_ = json.Unmarshal(raw, &p)
+
+	s := NewKVAwareScorer(handle.Context()).WithName(name)
+
+	// one-time FFI init (runtime + persistent pipeline)
+	warmupOnce.Do(func() {
+		defer func() {
+			if r := recover(); r != nil {
+				warmupErr = fmt.Errorf("Dynamo configuration error: %v", r)
+			}
+		}()
+		warmupErr = initFFI()
+	})
+	if warmupErr != nil {
+		return nil, fmt.Errorf("Dynamo FFI init for the Router failed: %w", warmupErr)
+	}
+
+	return s, nil
+}
+
+// TypedName reports the plugin's type/name pair.
+func (k *KVAwareScorer) TypedName() plugins.TypedName {
+	return k.typedName
+}
+
+// --------------------------- FFI integration ---------------------------
+
+// Package-level FFI state.
+//   - ffiOnce/ffiErr: initFFI runs its body once and returns ffiErr on every call.
+//   - ffi* settings: filled in by loadDynamoConfig from DYNAMO_* environment variables.
+//   - runtimeInitialized: set after dynamo_llm_init succeeds, cleared by
+//     cleanupDynamo after a successful dynamo_llm_shutdown.
+//   - pipeline/pipelineMutex: the handle created in initFFI and destroyed in
+//     cleanupDynamo, both under the write lock; readers take the read lock.
+var (
+	ffiOnce sync.Once
+	ffiErr  error
+
+	ffiNamespace          string
+	ffiComponent          string
+	ffiModel              string
+	ffiOverlapScoreWeight float64
+	ffiRouterTemperature  float64
+	ffiKvBlockSize        uint32
+	ffiWorkerID           int64
+	ffiEnforceDisagg      bool
+
+	runtimeInitialized bool
+
+	// Boxed pipeline handle (owned on the Rust side, opaque here)
+	pipeline      *C.struct_WorkerSelectionPipeline
+	pipelineMutex sync.RWMutex
+)
+
+// loadDynamoConfig populates the package-level ffi* settings from DYNAMO_*
+// environment variables, falling back to built-in defaults for the optional ones.
+//
+// DYNAMO_KV_BLOCK_SIZE is mandatory. It must be an integer in [16, 8192] and a
+// power of two; any violation panics (KVAwareScorerFactory recovers the panic
+// and reports it as a configuration error).
+func loadDynamoConfig() {
+	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
+	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
+	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
+	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
+	ffiEnforceDisagg = getEnvBoolOrDefault("DYNAMO_ENFORCE_DISAGG", false)
+
+	ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
+	ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)
+
+	kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE")
+	if kvBlockSizeStr == "" {
+		panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
+	}
+	var parsed int64
+	if n, err := fmt.Sscanf(kvBlockSizeStr, "%d", &parsed); err != nil || n != 1 {
+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
+	}
+
+	// Validate the full 64-bit value BEFORE narrowing to uint32. Converting
+	// first would let negative numbers and values >= 2^32 wrap around into the
+	// accepted range (e.g. 4294967312 would silently become 16).
+	if parsed < 16 || parsed > 8192 {
+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", parsed))
+	}
+	if parsed&(parsed-1) != 0 {
+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", parsed))
+	}
+
+	// Safe: parsed is known to be within [16, 8192] here.
+	ffiKvBlockSize = uint32(parsed)
+	fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
+}
+
+// getEnvOrDefault returns the value of environment variable key, or def when
+// the variable is unset or empty.
+func getEnvOrDefault(key, def string) string {
+	value := os.Getenv(key)
+	if value == "" {
+		return def
+	}
+	return value
+}
+
+// getEnvInt64OrDefault parses environment variable key as a decimal integer
+// using fmt.Sscanf. It returns def when the variable is unset, empty, or does
+// not scan as an integer.
+func getEnvInt64OrDefault(key string, def int64) int64 {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return def
+	}
+	var parsed int64
+	matched, scanErr := fmt.Sscanf(raw, "%d", &parsed)
+	if scanErr != nil || matched != 1 {
+		return def
+	}
+	return parsed
+}
+
+// getEnvFloatOrDefault parses environment variable key as a floating-point
+// number using fmt.Sscanf. It returns def when the variable is unset, empty,
+// or does not scan as a number.
+func getEnvFloatOrDefault(key string, def float64) float64 {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return def
+	}
+	var parsed float64
+	matched, scanErr := fmt.Sscanf(raw, "%f", &parsed)
+	if scanErr != nil || matched != 1 {
+		return def
+	}
+	return parsed
+}
+
+// getEnvBoolOrDefault interprets environment variable key as a boolean,
+// case-insensitively: "true", "1", "yes", "on" mean true and "false", "0",
+// "no", "off" mean false. Any other value (including unset/empty) yields def.
+func getEnvBoolOrDefault(key string, def bool) bool {
+	normalized := strings.ToLower(os.Getenv(key))
+	if normalized == "true" || normalized == "1" || normalized == "yes" || normalized == "on" {
+		return true
+	}
+	if normalized == "false" || normalized == "0" || normalized == "no" || normalized == "off" {
+		return false
+	}
+	return def
+}
+
+// initFFI loads the configuration, initializes the Dynamo runtime and creates
+// the persistent worker-selection pipeline. The work happens on the first call
+// only; every call returns the error recorded by that first run (nil on success).
+func initFFI() error {
+	ffiOnce.Do(func() {
+		loadDynamoConfig()
+
+		// C copies of the configuration strings; released when this closure returns.
+		cNamespace := C.CString(ffiNamespace)
+		cComponent := C.CString(ffiComponent)
+		cModel := C.CString(ffiModel)
+		defer func() {
+			C.free(unsafe.Pointer(cModel))
+			C.free(unsafe.Pointer(cComponent))
+			C.free(unsafe.Pointer(cNamespace))
+		}()
+
+		// Step 1: bring up the Dynamo runtime.
+		initStatus := C.dynamo_llm_init(cNamespace, cComponent, C.int64_t(ffiWorkerID), C.uint32_t(ffiKvBlockSize))
+		if initStatus != C.DYNAMO_OK {
+			ffiErr = fmt.Errorf("dynamo_llm_init failed")
+			return
+		}
+		runtimeInitialized = true
+
+		// Step 2: create the pipeline handle under the write lock.
+		pipelineMutex.Lock()
+		defer pipelineMutex.Unlock()
+
+		createStatus := C.dynamo_create_worker_selection_pipeline(
+			cNamespace,
+			cComponent,
+			cModel,
+			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_ROUTING", true)),
+			C.double(getEnvFloatOrDefault("DYNAMO_BUSY_THRESHOLD", -1.0)),
+			C.double(ffiOverlapScoreWeight),
+			C.double(ffiRouterTemperature),
+			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_EVENTS", true)),
+			C.bool(getEnvBoolOrDefault("DYNAMO_ROUTER_REPLICA_SYNC", true)),
+			C.bool(ffiEnforceDisagg),
+			&pipeline,
+		)
+		if createStatus != C.DYNAMO_OK {
+			ffiErr = fmt.Errorf("dynamo_create_worker_selection_pipeline failed")
+		}
+	})
+	return ffiErr
+}
+
+// --------------------------- scoring ---------------------------
+
+// Score asks the Dynamo router which worker should serve req and records the
+// answer on the request: routing headers are written to req.Headers and, when
+// the request has an ID, the routing state is saved for PreRequest.
+//
+// The returned map gives every candidate pod the same score (1.0); the routing
+// decision travels in the headers, not in the scores. A router failure is
+// logged and otherwise ignored.
+func (k *KVAwareScorer) Score(
+	ctx context.Context,
+	cycleState *schedtypes.CycleState,
+	req *schedtypes.LLMRequest,
+	pods []schedtypes.Pod,
+) map[schedtypes.Pod]float64 {
+	logger := log.FromContext(ctx)
+
+	decodeID, prefillID, tokens, routeErr := k.callDynamoRouter(ctx, req)
+	switch {
+	case routeErr != nil:
+		logger.V(logutil.DEFAULT).Error(routeErr, "Dynamo call failed; proceeding without worker id")
+
+	case decodeID != "":
+		logger.V(logutil.DEFAULT).Info(
+			"Dynamo router selected worker",
+			"workerID", decodeID,
+			"prefillWorkerID", prefillID,
+			"tokenDataCount", len(tokens),
+		)
+
+		// Headers are consumed by the Lua filter at the gateway.
+		if req.Headers == nil {
+			req.Headers = map[string]string{}
+		}
+		req.Headers[WorkerIDHeader] = decodeID
+
+		// A distinct prefill worker means disaggregated serving; otherwise a
+		// single worker handles both prefill and decode.
+		disaggregated := prefillID != "" && prefillID != decodeID
+		if disaggregated {
+			req.Headers[RoutingModeHeader] = "disaggregated"
+			req.Headers[PrefillWorkerIDHeader] = prefillID
+		} else {
+			req.Headers[RoutingModeHeader] = "aggregated"
+		}
+
+		// Hand the decision over to PreRequest, which runs after scheduling is
+		// finalized and registers only committed requests with the router.
+		// The token IDs are kept because PreRequest passes them to the
+		// router's bookkeeping; they are not sent to workers via headers.
+		if req.RequestId != "" {
+			k.pluginState.Write(req.RequestId, plugins.StateKey(stateKey), &DynamoRoutingState{
+				WorkerID:        decodeID,
+				PrefillWorkerID: prefillID,
+				TokenData:       tokens,
+			})
+		}
+	}
+
+	scores := make(map[schedtypes.Pod]float64, len(pods))
+	for i := range pods {
+		scores[pods[i]] = 1.0
+	}
+	return scores
+}
+
+// PreRequest runs once scheduling is finalized, just before the request is sent
+// to the worker. It consumes the routing state saved by Score and registers the
+// request with the Dynamo router's bookkeeping, so that only requests that will
+// really be dispatched are tracked. Requests without an ID or without saved
+// state are skipped.
+func (k *KVAwareScorer) PreRequest(
+	ctx context.Context,
+	request *schedtypes.LLMRequest,
+	schedulingResult *schedtypes.SchedulingResult,
+) {
+	logger := log.FromContext(ctx)
+
+	if request == nil || request.RequestId == "" {
+		logger.V(logutil.DEBUG).Info("PreRequest: no request ID, skipping router bookkeeping")
+		return
+	}
+	reqID := request.RequestId
+
+	// Fetch the state written by Score, then drop it regardless of the outcome.
+	routing, readErr := plugins.ReadPluginStateKey[*DynamoRoutingState](
+		k.pluginState, reqID, plugins.StateKey(stateKey),
+	)
+	k.pluginState.Delete(reqID)
+
+	if readErr != nil {
+		// Nothing was stored, e.g. because the router call in Score failed.
+		logger.V(logutil.DEBUG).Info("PreRequest: no routing state found, skipping router bookkeeping",
+			"requestID", reqID)
+		return
+	}
+
+	// Scheduling is committed: tell the router about the request.
+	addErr := k.callAddRequest(ctx, reqID, routing.TokenData, routing.WorkerID, routing.PrefillWorkerID)
+	if addErr != nil {
+		logger.V(logutil.DEFAULT).Error(addErr, "PreRequest: failed to add request to router bookkeeping",
+			"requestID", reqID)
+		return
+	}
+
+	logger.V(logutil.VERBOSE).Info("PreRequest: registered request with router bookkeeping",
+		"requestID", reqID,
+		"workerID", routing.WorkerID,
+		"prefillWorkerID", routing.PrefillWorkerID,
+		"tokenCount", len(routing.TokenData),
+	)
+}
+
+// ResponseComplete runs after the full response has been sent to the client.
+// It releases the router's bookkeeping entry for the request by calling
+// dynamo_router_free_request (via callFreeRequestInternal). Requests that are
+// nil or carry no ID are skipped; a failed release is logged, not returned.
+func (k *KVAwareScorer) ResponseComplete(
+	ctx context.Context,
+	request *schedtypes.LLMRequest,
+	response *rc.Response,
+	targetPod *backend.Pod,
+) {
+	logger := log.FromContext(ctx)
+
+	switch {
+	case request == nil:
+		logger.V(logutil.DEBUG).Info("ResponseComplete: request is nil, skipping cleanup")
+		return
+	case request.RequestId == "":
+		logger.V(logutil.DEBUG).Info("ResponseComplete: no request ID, skipping cleanup")
+		return
+	}
+	reqID := request.RequestId
+
+	freeErr := callFreeRequestInternal(reqID)
+	if freeErr != nil {
+		logger.V(logutil.DEFAULT).Error(freeErr, "ResponseComplete: failed to free request",
+			"requestID", reqID)
+		return
+	}
+
+	logger.V(logutil.VERBOSE).Info("ResponseComplete: freed request from router",
+		"requestID", reqID)
+}
+
+// --------------------------- router call (persistent only) ---------------------------
+
+// callDynamoRouter sends req to the persistent worker-selection pipeline and
+// returns the router's decision.
+//
+// Returns:
+//   - workerID: the decode worker ID as a decimal string.
+//   - prefillWorkerID: the prefill worker ID as a decimal string, or "" when the
+//     router reports a negative ID (not in disaggregated mode).
+//   - tokenData: the token IDs produced by the router, copied into Go memory.
+//   - err: non-nil when FFI init failed, the runtime/pipeline is unavailable,
+//     the request could not be marshalled, or the FFI call reported failure.
+func (k *KVAwareScorer) callDynamoRouter(
+	ctx context.Context,
+	req *schedtypes.LLMRequest,
+) (workerID string, prefillWorkerID string, tokenData []int64, err error) {
+	logger := log.FromContext(ctx)
+
+	// initFFI must run before pipelineMutex is taken here: its first
+	// invocation acquires the write lock while creating the pipeline.
+	if err := initFFI(); err != nil {
+		logger.V(logutil.DEFAULT).Error(err, "FFI init failed")
+		return "", "", nil, err
+	}
+
+	// Hold the read lock for the entire FFI call. Copying the handle and
+	// releasing the lock first would let cleanupDynamo destroy the pipeline
+	// while the call below is still using it. runtimeInitialized is checked
+	// under the same lock because cleanupDynamo modifies it under the write lock.
+	pipelineMutex.RLock()
+	defer pipelineMutex.RUnlock()
+
+	if !runtimeInitialized {
+		return "", "", nil, fmt.Errorf("dynamo runtime not initialized")
+	}
+	if pipeline == nil {
+		return "", "", nil, fmt.Errorf("dynamo worker selection pipeline not created")
+	}
+
+	// Build OpenAI-compatible JSON request from the new LLMRequest structure
+	requestBody := buildOpenAIRequest(req)
+	requestJSON, jsonErr := json.Marshal(requestBody)
+	if jsonErr != nil {
+		logger.V(logutil.DEFAULT).Error(jsonErr, "Failed to marshal OpenAI request")
+		return "", "", nil, fmt.Errorf("marshal OpenAI request: %w", jsonErr)
+	}
+	cRequestJSON := C.CString(string(requestJSON))
+	defer C.free(unsafe.Pointer(cRequestJSON))
+
+	// Output variables
+	var cDecodeWorkerID C.int64_t
+	var cPrefillWorkerID C.int64_t
+	var cTokens *C.uint32_t
+	var cTokenCount C.size_t
+	var cAnnotatedJSON *C.char
+
+	// Call the worker selection pipeline
+	status := C.dynamo_query_worker_selection_and_annotate(
+		pipeline,
+		cRequestJSON,
+		&cDecodeWorkerID,
+		&cPrefillWorkerID,
+		&cTokens,
+		&cTokenCount,
+		&cAnnotatedJSON,
+	)
+	if status != C.DYNAMO_OK {
+		return "", "", nil, fmt.Errorf("dynamo_query_worker_selection_and_annotate failed")
+	}
+
+	// Copy tokens into Go memory and free C memory
+	count := int(uintptr(cTokenCount))
+	var tokens64 []int64
+	if count > 0 && cTokens != nil {
+		src := unsafe.Slice((*uint32)(unsafe.Pointer(cTokens)), count)
+		tokens64 = make([]int64, count)
+		for i := 0; i < count; i++ {
+			tokens64[i] = int64(src[i])
+		}
+	}
+	C.dynamo_free_worker_selection_result(cTokens, cTokenCount, cAnnotatedJSON)
+
+	workerIDStr := fmt.Sprintf("%d", int64(cDecodeWorkerID))
+	prefillWorkerIDStr := ""
+	// Rust returns -1 for prefill_worker_id when not in disaggregated mode
+	if int64(cPrefillWorkerID) >= 0 {
+		prefillWorkerIDStr = fmt.Sprintf("%d", int64(cPrefillWorkerID))
+	}
+	logger.V(logutil.DEFAULT).Info("Worker selection completed",
+		"workerID", workerIDStr, "prefillWorkerID", prefillWorkerIDStr, "tokenCount", count)
+
+	return workerIDStr, prefillWorkerIDStr, tokens64, nil
+}
+
+// buildOpenAIRequest constructs an OpenAI-compatible request from the new LLMRequest structure
+func buildOpenAIRequest(req *schedtypes.LLMRequest) map[string]any {
+	requestBody := make(map[string]any)
+
+	// Extract prompt from the new Body structure
+	userText := "default prompt"
+	if req != nil && req.Body != nil {
+		if req.Body.ChatCompletions != nil && len(req.Body.ChatCompletions.Messages) > 0 {
+			// Extract text from chat completions messages
+			var sb strings.Builder
+			for _, msg := range req.Body.ChatCompletions.Messages {
+				sb.WriteString(msg.Content.PlainText())
+				sb.WriteString(" ")
+			}
+			userText = strings.TrimSpace(sb.String())
+		} else if req.Body.Completions != nil && req.Body.Completions.Prompt != "" {
+			userText = req.Body.Completions.Prompt
+		}
+	}
+
+	requestBody["messages"] = []map[string]any{{"role": "user", "content": userText}}
+	if req != nil && strings.TrimSpace(req.TargetModel) != "" {
+		requestBody["model"] = req.TargetModel
+	} else {
+		requestBody["model"] = ffiModel
+	}
+	requestBody["max_tokens"] = 1
+	requestBody["temperature"] = 0.0
+	requestBody["stream"] = true
+	requestBody["nvext"] = map[string]any{
+		"annotations": []string{"query_instance_id"},
+	}
+	return requestBody
+}
+
+// --------------------------- router bookkeeping ---------------------------
+
+// callAddRequest registers a request with the router's bookkeeping via
+// dynamo_router_add_request. It is called from PreRequest once worker
+// selection has been committed.
+//
+// Parameters:
+//   - requestID: identifier under which the router tracks the request.
+//   - tokenData: token IDs for the request; each is narrowed to uint32.
+//   - workerID: decimal worker ID (the decode worker in disaggregated mode).
+//   - prefillWorkerID: currently not used by this function.
+//
+// Returns an error when the runtime or pipeline is unavailable, workerID is
+// not a valid unsigned integer, or the FFI call reports failure.
+func (k *KVAwareScorer) callAddRequest(
+	ctx context.Context,
+	requestID string,
+	tokenData []int64,
+	workerID string,
+	prefillWorkerID string,
+) error {
+	logger := log.FromContext(ctx)
+
+	// Hold the read lock for the entire FFI call. Copying the handle and
+	// releasing the lock first would let cleanupDynamo destroy the pipeline
+	// while the call below is still using it. runtimeInitialized is checked
+	// under the same lock because cleanupDynamo modifies it under the write lock.
+	pipelineMutex.RLock()
+	defer pipelineMutex.RUnlock()
+
+	if !runtimeInitialized {
+		return fmt.Errorf("dynamo runtime not initialized")
+	}
+	if pipeline == nil {
+		return fmt.Errorf("dynamo worker selection pipeline not created")
+	}
+
+	// Parse worker ID (use decode worker for bookkeeping in disagg mode)
+	var workerIDUint uint64
+	if _, err := fmt.Sscanf(workerID, "%d", &workerIDUint); err != nil {
+		return fmt.Errorf("invalid worker ID: %s", workerID)
+	}
+
+	// Convert token data from int64 to uint32
+	tokens := make([]uint32, len(tokenData))
+	for i, t := range tokenData {
+		tokens[i] = uint32(t)
+	}
+
+	cRequestID := C.CString(requestID)
+	defer C.free(unsafe.Pointer(cRequestID))
+
+	// Pass a pointer to the Go-owned token slice for the duration of the call;
+	// an empty slice is passed as a nil pointer with length 0.
+	var cTokens *C.uint32_t
+	if len(tokens) > 0 {
+		cTokens = (*C.uint32_t)(unsafe.Pointer(&tokens[0]))
+	}
+
+	status := C.dynamo_router_add_request(
+		pipeline,
+		cRequestID,
+		cTokens,
+		C.size_t(len(tokens)),
+		C.uint64_t(workerIDUint),
+		C.uint32_t(0), // dp_rank = 0 for now
+	)
+
+	if status != C.DYNAMO_OK {
+		return fmt.Errorf("dynamo_router_add_request failed")
+	}
+
+	logger.V(logutil.VERBOSE).Info("Added request to router bookkeeping",
+		"requestID", requestID, "workerID", workerID, "tokenCount", len(tokens))
+	return nil
+}
+
+// CallMarkPrefillComplete marks prefill as completed for a request by calling
+// dynamo_router_mark_prefill_complete.
+// Exported for use by response handlers.
+//
+// Returns an error when the runtime or pipeline is unavailable or the FFI
+// call reports failure.
+func CallMarkPrefillComplete(requestID string) error {
+	// Hold the read lock for the entire FFI call. Copying the handle and
+	// releasing the lock first would let cleanupDynamo destroy the pipeline
+	// while the call below is still using it. runtimeInitialized is checked
+	// under the same lock because cleanupDynamo modifies it under the write lock.
+	pipelineMutex.RLock()
+	defer pipelineMutex.RUnlock()
+
+	if !runtimeInitialized {
+		return fmt.Errorf("dynamo runtime not initialized")
+	}
+	if pipeline == nil {
+		return fmt.Errorf("dynamo worker selection pipeline not created")
+	}
+
+	cRequestID := C.CString(requestID)
+	defer C.free(unsafe.Pointer(cRequestID))
+
+	if status := C.dynamo_router_mark_prefill_complete(pipeline, cRequestID); status != C.DYNAMO_OK {
+		return fmt.Errorf("dynamo_router_mark_prefill_complete failed")
+	}
+	return nil
+}
+
+// callFreeRequestInternal cleans up router state for a completed/cancelled
+// request by calling dynamo_router_free_request.
+//
+// Returns an error when the runtime or pipeline is unavailable or the FFI
+// call reports failure.
+func callFreeRequestInternal(requestID string) error {
+	// Hold the read lock for the entire FFI call. Copying the handle and
+	// releasing the lock first would let cleanupDynamo destroy the pipeline
+	// while the call below is still using it. runtimeInitialized is checked
+	// under the same lock because cleanupDynamo modifies it under the write lock.
+	pipelineMutex.RLock()
+	defer pipelineMutex.RUnlock()
+
+	if !runtimeInitialized {
+		return fmt.Errorf("dynamo runtime not initialized")
+	}
+	if pipeline == nil {
+		return fmt.Errorf("dynamo worker selection pipeline not created")
+	}
+
+	cRequestID := C.CString(requestID)
+	defer C.free(unsafe.Pointer(cRequestID))
+
+	if status := C.dynamo_router_free_request(pipeline, cRequestID); status != C.DYNAMO_OK {
+		return fmt.Errorf("dynamo_router_free_request failed")
+	}
+	return nil
+}
+
+// --------------------------- shutdown ---------------------------
+
+// cleanupDynamo tears down what initFFI set up, in reverse order: the pipeline
+// handle is destroyed first, then the runtime is shut down. Everything happens
+// under the pipeline write lock.
+//
+// A failed pipeline destroy only prints a warning and the handle is cleared
+// anyway. A failed runtime shutdown is returned as an error and leaves
+// runtimeInitialized set.
+func cleanupDynamo() error {
+	pipelineMutex.Lock()
+	defer pipelineMutex.Unlock()
+
+	if handle := pipeline; handle != nil {
+		destroyStatus := C.dynamo_destroy_worker_selection_pipeline(handle)
+		if destroyStatus != C.DYNAMO_OK {
+			fmt.Printf("Warning: dynamo_destroy_worker_selection_pipeline failed\n")
+		}
+		pipeline = nil
+	}
+
+	if !runtimeInitialized {
+		return nil
+	}
+	shutdownStatus := C.dynamo_llm_shutdown()
+	if shutdownStatus != C.DYNAMO_OK {
+		return fmt.Errorf("dynamo_llm_shutdown failed")
+	}
+	runtimeInitialized = false
+	return nil
+}
--- a/deploy/inference-gateway/helm/dynamo-gaie/epp-config-dynamo.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/epp-config-dynamo.yaml
@@ -13,6 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# Dynamo EPP Configuration
+#
+# The KV scorer sets routing headers that the Lua filter at the gateway
+# reads to inject nvext into the request body:
+#   - x-worker-instance-id: Selected worker ID
+#   - x-prefiller-host-port: Prefill worker (disaggregated mode)
+#   - x-dynamo-routing-mode: "aggregated" or "disaggregated"
+
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
@@ -22,14 +30,15 @@ plugins:
  # Picker: chooses the final endpoint after scoring
  - name: picker
    type: max-score-picker
-  - name: dyn-pre
-    type: dynamo-inject-workerid
-    parameters: {}
+
+  # Dynamo KV-aware Scorer: calls Dynamo router FFI for worker selection
+  # Implements Scorer, PreRequest, and ResponseComplete:
+  # - Score: Selects workers based on KV cache, sets routing headers
+  # - PreRequest: Registers request with router bookkeeping
+  # - ResponseComplete: Frees router bookkeeping when response completes
  - name: dyn-kv
    type: kv-aware-scorer
-    parameters:
-      frontendURL: http://127.0.0.1:8000/v1/chat/completions
-      timeoutMS: 10000
+
 schedulingProfiles:
  - name: default
    plugins:

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role-binding.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role-binding.yaml
@@ -22,4 +22,5 @@ subjects:
  namespace: {{ .Release.Namespace }}
 roleRef:
  kind: ClusterRole
-  name: pod-read
\ No newline at end of file
+  name: pod-read
+  apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/cluster-role.yaml
@@ -19,10 +19,10 @@ metadata:
 rules:
 # Gateway API inference resources
 - apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencepools"]
+  resources: ["inferencepools", "inferenceobjectives", "inferencemodelrewrites"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
-  resources: ["inferencemodels"]
+- apiGroups: ["inference.networking.k8s.io"]
+  resources: ["inferencepools"]
  verbs: ["get", "watch", "list"]
 # Core resources for pod discovery
 - apiGroups: [""]

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 {{- /* ------------ file-scope vars (no output) ------------ */ -}}
 {{- $platformNs   := default .Release.Namespace .Values.platformNamespace -}}
 {{- $platformName := default "dynamo-platform" .Values.platformReleaseName -}}
@@ -23,10 +24,10 @@
 {{- $std          := .Values.extension.standardImage -}}
 {{- $dyn          := .Values.extension.dynamoImage -}}
 {{- $fallback     := ternary $dyn $std .Values.epp.useDynamo -}}
-{{- $eppImage     := default $fallback .Values.extension.image -}}
-
+{{- $eppImage     := default $fallback .Values.extension.image }}

---  # <-- start of actual YAML document
+---
+# Deployment for the EPP (Endpoint Picker Plugin)
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -61,26 +62,30 @@ spec:
        {{- if .Values.epp.argsOverride }}
        {{- toYaml .Values.epp.argsOverride | nindent 8 }}
        {{- else }}
-          - -poolName
+          - -pool-name
          - "{{ .Values.model.shortName }}-pool"
-          - -poolNamespace
+          - -pool-namespace
          - "{{ .Release.Namespace }}"
+          - -pool-group
+          - "inference.networking.x-k8s.io"
          - -v
          - "4"
          - --zap-encoder
          - "json"
-          - -grpcPort
+          - -grpc-port
          - "9002"
-          - -grpcHealthPort
+          - -grpc-health-port
          - "9003"
          {{- if $useDynamo }}
-          - -configFile
+          - -config-file
          - "{{ .Values.epp.configFile }}"
          {{- end }}
        {{- end }}

-        {{- if $useDynamo }}
        volumeMounts:
+          - name: hf-cache
+            mountPath: /home/nonroot/.cache
+        {{- if $useDynamo }}
          - name: epp-config
            mountPath: /etc/epp
            readOnly: true
@@ -117,11 +122,21 @@ spec:
            value: "true"
          - name: USE_STREAMING
            value: "true"
+          # HuggingFace token for downloading model config files
+          # Without this, HuggingFace rate-limits requests (429 Too Many Requests)
+          - name: HF_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: HF_TOKEN
+                optional: true
        {{- end }}
        {{- range .Values.epp.extraEnv }}
          - name: {{ .name }}
            value: {{ .value | quote }}
        {{- end }}
+        {{- /* Emit the default RUST_LOG only when extraEnv does not already define it, so a user-supplied value is not duplicated by the hard-coded entry. */ -}}
+        {{- $rustLogSet := false }}
+        {{- range .Values.epp.extraEnv }}
+        {{- if eq (toString .name) "RUST_LOG" }}{{- $rustLogSet = true }}{{- end }}
+        {{- end }}
+        {{- if not $rustLogSet }}
+          - name: RUST_LOG
+            value: "debug,dynamo_llm::kv_router=trace"
+        {{- end }}

        ports:
          - containerPort: 9002
@@ -141,8 +156,10 @@ spec:
          initialDelaySeconds: 5
          periodSeconds: 10

-      {{- if $useDynamo }}
      volumes:
+        - name: hf-cache
+          emptyDir: {}
+      {{- if $useDynamo }}
        - name: epp-config
          configMap:
            name: {{ include "dynamo-gaie.fullname" . }}-epp-config

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/http-router.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/http-router.yaml
@@ -14,6 +14,8 @@
 # limitations under the License.

 {{- if .Values.httpRoute.enabled }}
+{{- /* Default gatewayNamespace to the release namespace if not specified */ -}}
+{{- $gatewayNs := default .Release.Namespace .Values.httpRoute.gatewayNamespace }}
 apiVersion: gateway.networking.k8s.io/v1
 kind: HTTPRoute
 metadata:
@@ -24,9 +26,10 @@ spec:
  - group: gateway.networking.k8s.io
    kind: Gateway
    name: {{ .Values.httpRoute.gatewayName }}
+    namespace: {{ $gatewayNs }}
  rules:
  - backendRefs:
-    - group: inference.networking.x-k8s.io
+    - group: inference.networking.k8s.io
      kind: InferencePool
      name: {{ .Values.model.shortName }}-pool
      namespace: {{ .Release.Namespace }}

--- a/deploy/inference-gateway/helm/dynamo-gaie/templates/inference-model.yaml
+++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/inference-model.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  name: {{ .Values.model.shortName }}-model
-  namespace: {{ .Release.Namespace }}
-spec:
-  criticality: {{ .Values.model.criticality }}
-  modelName: {{ .Values.model.identifier }}
-  poolRef:
-    group: inference.networking.x-k8s.io
-    kind: InferencePool
-    name: {{ .Values.model.shortName }}-pool
\ No newline at end of file