Unverified Commit 4810ad34 authored by atchernych's avatar atchernych Committed by GitHub
Browse files

feat: update GAIE to release version with hints in headers (#5503)


Signed-off-by: default avatarAnna Tchernych <atchernych@nvidia.com>
parent b31b5b56
# SPDX-FileCopyrightText: Copyright The Kubernetes Authors.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
# Dockerfile.epp - Custom Dockerfile for GAIE EPP. This is to be used with the deploy/inference-gateway/build-epp-dynamo.sh
# Global build args. Declared before the first FROM, so they are only visible
# to FROM lines unless redeclared inside a stage.
# DOCKER_PROXY is concatenated directly in front of the image name below, so a
# non-empty value needs its own trailing slash; leaving it unset pulls the
# images under their plain names.
ARG DOCKER_PROXY
ARG BUILDER_IMAGE="golang:1.24"
ARG BASE_IMAGE="ubuntu:22.04"

############################
# Builder
############################
FROM ${DOCKER_PROXY}${BUILDER_IMAGE} AS builder

ENV CGO_ENABLED=1
# be explicit; helps cgo when linking libstdc++
ENV CC=gcc
ENV CXX=g++

# C/C++ toolchain for cgo, and libstdc++ for link-time
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        g++ \
        gcc \
        libc6-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# deps first (cache): only go.mod/go.sum changes re-run the module download
COPY go.mod go.sum ./
RUN go mod download

# source
COPY cmd/epp ./cmd/epp
COPY pkg/epp ./pkg/epp
COPY internal ./internal
COPY api ./api

# sanity (optional): list the Dynamo header/library directories; a missing
# directory only prints a notice and does not fail the build at this point
RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/include/ || echo "Headers not found"
RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib/ || echo "Library not found"

# build
WORKDIR /src/cmd/epp

# Version metadata is declared as late as possible: every RUN that follows an
# ARG uses it implicitly, so declaring these earlier would invalidate the
# cached `go mod download` layer whenever the commit SHA changes.
ARG COMMIT_SHA=unknown
ARG BUILD_REF
RUN go build \
    -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" \
    -o /epp
############################
# Runtime
############################
FROM ${DOCKER_PROXY}${BASE_IMAGE} AS runtime

# Minimal runtime deps; include libstdc++ runtime for -lstdc++
# The nonroot account gets a fixed UID/GID so the image can declare a numeric
# USER below; the Hugging Face cache directory is pre-created and owned by it.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r -g 10001 nonroot \
    && useradd -r -u 10001 -g nonroot -m -d /home/nonroot nonroot \
    && mkdir -p /home/nonroot/.cache/huggingface/hub \
    && chown -R nonroot:nonroot /home/nonroot

WORKDIR /
COPY --from=builder /epp /epp

# Declared after the package-install layer so a new commit SHA does not
# invalidate the cached apt/useradd layer above.
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=$DYNAMO_COMMIT_SHA

# Set HOME so ModelExpress can find the cache directory
ENV HOME=/home/nonroot

# Numeric UID:GID (the nonroot account created above) so Kubernetes can verify
# `runAsNonRoot` without having to resolve a user name inside the image.
USER 10001:10001
ENTRYPOINT ["/epp"]
......@@ -199,8 +199,8 @@ The frontend image is a specialized container that includes the Dynamo component
```
The build process automatically:
1. Clones the Gateway API Inference Extension (GAIE) repository
2. Builds the custom EPP image with Dynamo routing capabilities
1. Builds the Dynamo static library for EPP KV-aware routing
2. Builds the custom EPP Docker image using `make all` from `deploy/inference-gateway/epp/Makefile`
3. Builds the frontend image with the EPP binary and Dynamo runtime components
For more details, see [`deploy/inference-gateway/README.md`](../deploy/inference-gateway/README.md).
......
......@@ -138,10 +138,6 @@ SGLANG_CUDA_VERSION="12.9.1"
SGLANG_CUDA_VERSION_CU13="13.0.1"
SGLANG_RUNTIME_IMAGE_TAG_CU13="v0.5.7-cu130-runtime"
# GAIE (Gateway API Inference Extension) configuration for frontend (required for EPP binary for frontend image)
GAIE_REPO_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension.git"
GAIE_VERSION="v0.5.1"
PYTHON_VERSION="3.12"
NIXL_REF=0.8.0
......@@ -969,39 +965,33 @@ show_image_options
# Handle FRONTEND target: build EPP image first
if [[ ${TARGET^^} == "FRONTEND" ]]; then
echo "Building FRONTEND image - requires EPP image"
# Build base dynamo image first (framework=NONE, target=dev)
echo ""
echo "Building EPP image for Frontend..."
# Set up paths for GAIE
GAIE_CLONE_DIR="${BUILD_CONTEXT}/.build/external/gateway-api-inference-extension"
echo "Building EPP image for Frontend using Makefile..."
# Clone GAIE repo
echo ""
echo "Cloning GAIE repository at ${GAIE_VERSION}..."
$RUN_PREFIX rm -rf "${GAIE_CLONE_DIR}"
$RUN_PREFIX mkdir -p "$(dirname "${GAIE_CLONE_DIR}")"
$RUN_PREFIX git clone ${GAIE_REPO_URL} "${GAIE_CLONE_DIR}"
$RUN_PREFIX cd "${GAIE_CLONE_DIR}"
$RUN_PREFIX git checkout ${GAIE_VERSION}
$RUN_PREFIX cd "${BUILD_CONTEXT}"
# Build EPP image
echo ""
echo "Building EPP image..."
export GAIE_DIR="${GAIE_CLONE_DIR}"
export DYNAMO_DIR="${BUILD_CONTEXT}"
# EPP directory with the new self-contained build
EPP_DIR="${BUILD_CONTEXT}/deploy/inference-gateway/epp"
# Set DOCKER_PROXY from ECR_HOSTNAME if available (for pulling base images through proxy)
# This prevents rate-limiting when building in CI across multiple PRs
DOCKER_PROXY_ARG=""
if [[ -n "${ECR_HOSTNAME}" ]]; then
export DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
DOCKER_PROXY_ARG="DOCKER_PROXY=${DOCKER_PROXY}"
echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
fi
$RUN_PREFIX bash ${DYNAMO_DIR}/deploy/inference-gateway/build-epp-dynamo.sh
# Set EPP image tag (matches what build-epp-dynamo.sh produces)
EPP_IMAGE_TAG="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:${GAIE_VERSION}-dirty"
# Build EPP image using the Makefile
# The Makefile handles: building Dynamo library, building Docker image, loading it locally
$RUN_PREFIX make -C "${EPP_DIR}" all DYNAMO_DIR="${BUILD_CONTEXT}" ${DOCKER_PROXY_ARG}
# Compute EPP image tag (must match Makefile's IMAGE_TAG)
# IMAGE_TAG = $(IMAGE_REPO):$(GIT_TAG)
# IMAGE_REPO = $(DOCKER_SERVER)/$(IMAGE_NAME)
# Image lives in local cache only, not pushed to any registry
EPP_DOCKER_SERVER="dynamo"
EPP_IMAGE_NAME="dynamo-epp"
EPP_GIT_TAG=$(git describe --tags --dirty --always 2>/dev/null || echo "dev")
EPP_IMAGE_TAG="${EPP_DOCKER_SERVER}/${EPP_IMAGE_NAME}:${EPP_GIT_TAG}"
echo "Successfully built EPP image: ${EPP_IMAGE_TAG}"
......
## Inference Gateway Setup with Dynamo
When integrating Dynamo with the Inference Gateway you could either use the default EPP image provided by the extension or use the custom Dynamo image.
When integrating Dynamo with the Inference Gateway it is recommended to use the custom Dynamo EPP image.
1. When using the Dynamo custom EPP image you will take advantage of the Dynamo router when EPP chooses the best worker to route the request to. This setup uses a custom Dynamo plugin `dyn-kv` to pick the best worker. In this case the Dynamo routing logic is moved upstream. We recommend this approach.
1. **Dynamo EPP (Recommended):** The custom Dynamo EPP image integrates the Dynamo router directly into the gateway's endpoint picker. Using the `dyn-kv` plugin, it selects the optimal worker based on KV cache state and tokenized prompt before routing the request. The integration moves intelligent routing upstream to the gateway layer.
2. When using the GAIE-provided image for the EPP, the Dynamo deployment is treated as a black box and the EPP would route round-robin. In this case GAIE just fans out the traffic, and the smarts only remain within the Dynamo graph. Use this if you have one Dynamo graph and do not want to obtain the Dynamo EPP image. This is a "backup" approach.
2. **Standard EPP (Fallback):** You can use the default GAIE EPP image, which treats the Dynamo deployment as a black box and routes requests round-robin. Routing intelligence remains within the Dynamo graph itself. Use this approach if you have a single Dynamo graph and don't need the custom EPP image.
EPP’s default kv-routing approach is not token-aware because the prompt is not tokenized. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
The setup provided here uses the custom Dynamo EPP by default. Set `epp.useDynamo=false` in your deployment to select approach 2.
EPP’s default kv-routing approach is not token-aware because the prompt is hashed without tokenization. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
Dynamo Integration with the Inference Gateway supports Aggregated and Disaggregated Serving.
If you want to use LoRA, deploy Dynamo either without the Inference Gateway or with the Inference Gateway in the black-box approach.
Currently, these setups are only supported with the kGateway based Inference Gateway.
......@@ -16,7 +19,19 @@ Currently, these setups are only supported with the kGateway based Inference Gat
- [Prerequisites](#prerequisites)
- [Installation Steps](#installation-steps)
- [Usage](#6-usage)
- [1. Install Dynamo Platform](#1-install-dynamo-platform)
- [2. Deploy Inference Gateway](#2-deploy-inference-gateway)
- [3. Deploy Your Model](#3-deploy-your-model)
- [4. Build EPP image](#4-build-epp-image)
- [5. Install Dynamo GAIE helm chart](#5-install-dynamo-gaie-helm-chart)
- [6. Verify Installation](#6-verify-installation)
- [7. Usage](#7-usage)
- [8. Deleting the installation](#8-deleting-the-installation)
- [Gateway API Inference Extension Details](#gateway-api-inference-extension-integration)
- [v1.2.1 API Changes](#v121-api-changes)
- [Building for v1.2.1](#building-for-v121)
- [Header-Only Routing for v1.2.1](#header-only-routing-for-v121)
## Prerequisites
......@@ -34,19 +49,22 @@ Currently, these setups are only supported with the kGateway based Inference Gat
First, deploy an inference gateway service. In this example, we'll install `kgateway` based gateway implementation.
```bash
./install_gaie_crd_kgateway.sh
cd deploy/inference-gateway
./scripts/install_gaie_crd_kgateway.sh
```
**Note**: The manifest at `config/manifests/gateway/kgateway/gateway.yaml` uses `gatewayClassName: agentgateway`, but kGateway's helm chart creates a GatewayClass named `kgateway`. The patch command in the script fixes this mismatch.
Verify installation:
#### f. Verify the Gateway is running
```bash
kubectl get gateway inference-gateway -n my-model
kubectl get gateway inference-gateway
# Sample output
# NAME CLASS ADDRESS PROGRAMMED AGE
# inference-gateway kgateway x.x.x.x True 1m
# inference-gateway kgateway True 1m
```
### 3. Deploy Your Model ###
Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace.
......@@ -54,7 +72,8 @@ Follow the steps in [model deployment](../../examples/backends/vllm/deploy/READM
Sample commands to deploy model:
```bash
cd <dynamo-source-root>/examples/backends/vllm/deploy
cd <dynamo-source-root>
cd examples/backends/vllm/deploy
kubectl apply -f agg.yaml -n my-model
```
......@@ -83,14 +102,42 @@ Create a model configuration file similar to the vllm_agg_qwen.yaml for your mod
This file demonstrates the values needed for the Vllm Agg setup in [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml)
Take a note of the model's block size provided in the model card.
### 4. Install Dynamo GAIE helm chart ###
### 4. Build EPP image
You can either use the provided Dynamo FrontEnd image as the EPP image or build your own custom Dynamo EPP image by following the steps below.
```bash
# export env vars
export DOCKER_SERVER=ghcr.io/nvidia/dynamo # Container registry
export IMAGE_TAG=YOUR-TAG # Or auto from git tag
cd deploy/inference-gateway/epp
make all # Do everything in one command
# or make all-push to also push
# Or step-by-step
make dynamo-lib # Build Dynamo library and copy to project
make image-load # Build Docker image and load locally
make image-push # Build and push to registry
make info # Check image tag
```
#### All-in-one Targets
| Target | Description |
|--------|-------------|
| `make dynamo-lib` | Build Dynamo static library and copy to project |
| `make all` | Build Dynamo lib + Docker image + load locally |
| `make all-push` | Build Dynamo lib + Docker image + push to registry |
### 5. Install Dynamo GAIE helm chart ###
The Inference Gateway is configured through the `inference-gateway-resources.yaml` file.
Deploy the Inference Gateway resources to your Kubernetes cluster by running the command below.
```bash
cd deploy/inference-gateway
cd deploy/inference-gateway/
# Export the Dynamo image you have used when deploying your model in Step 3.
export DYNAMO_IMAGE=<the-dynamo-image-you-have-used-when-deploying-the-model>
......@@ -122,7 +169,7 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
- Overwrite the `DYN_NAMESPACE` env var if needed to match your model's dynamo namespace.
- Set `DYNAMO_BUSY_THRESHOLD` to configure the upper bound on how “full” a worker can be (often derived from kv_active_blocks or other load metrics) before the router skips it. If the selected worker exceeds this value, routing falls back to the next best candidate. By default the value is negative meaning this is not enabled.
- Set `DYNAMO_ROUTER_REPLICA_SYNC=true` to enable a background watcher to keep multiple router instances in sync (important if you run more than one KV router per component).
- Set `DYNAMO_ENFORCE_DISAGG=true` if you want to enforce that every request is served in the disaggregated manner. By default it is false, meaning that if the prefill worker is not available the request will be served in the aggregated manner.
- By default the Dynamo plugin uses KV routing. You can set `DYNAMO_USE_KV_ROUTING=false` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) if you prefer to route in a round-robin fashion.
- If using kv-routing:
- Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) to match your model's block size. The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
......@@ -132,52 +179,25 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
- See the [KV cache routing design](../../docs/router/kv_cache_routing.md) for details.
Dynamo provides a custom routing plugin `pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go` to perform efficient kv routing.
The Dynamo router is built as a static library, which the EPP router calls to provide fast inference.
You can either use the special FrontEnd image for the EPP_IMAGE in the Helm deployment command and proceed to the step 2 or you can build the image yourself following the steps below.
##### 1. Build the custom EPP image #####
If you choose to build your own image, use the `container/build.sh` script with the `--target frontend` option:
```bash
./container/build.sh --framework none --target frontend
```
This command automatically:
- Clones the Gateway API Inference Extension (GAIE) repository at the correct version
- Builds the Dynamo Router static library
- Applies the necessary patches to the EPP codebase
- Builds the custom EPP image with Dynamo KV routing support
- Builds the frontend image with the EPP binary and Dynamo runtime components
Re-tag the freshly built image and push it to your registry:
```bash
docker images
docker tag <your-new-id> <your-image-tag>
docker push <your-image-tag>
```
**Note**
You can also use the standard EPP image`us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.4.0`. For the basic black box integration run:
You can also use the standard EPP image i.e. `us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v1.2.1` for the basic black box integration.
```bash
cd deploy/inference-gateway
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml
# Optionally export the standard EPP image if you do not want to use the default we suggest.
export EPP_IMAGE=us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v0.4.0
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false --set-string extension.image=$EPP_IMAGE
# Optionally overwrite the image --set-string extension.image=$EPP_IMAGE
```
### 5. Verify Installation ###
### 6. Verify Installation ###
Check that all resources are properly deployed:
```bash
kubectl get inferencepool
kubectl get inferencemodel
kubectl get httproute
kubectl get service
kubectl get gateway
......@@ -190,16 +210,12 @@ Sample output:
NAME AGE
qwen-pool 33m
# kubectl get inferencemodel
NAME MODEL NAME INFERENCE POOL CRITICALITY AGE
qwen-model Qwen/Qwen3-0.6B qwen-pool Critical 33m
# kubectl get httproute
NAME HOSTNAMES AGE
qwen-route 33m
```
### 6. Usage ###
### 7. Usage ###
The Inference Gateway provides HTTP endpoints for model inference.
......@@ -310,11 +326,56 @@ Sample inference output:
}
```
### 7. Deleting the installation ###
### 8. Deleting the installation ###
If you need to uninstall run:
```bash
kubectl delete dynamoGraphDeployment vllm-agg
helm uninstall dynamo-gaie -n my-model
# To uninstall GAIE
# 1. Delete the inference-gateway
kubectl delete gateway inference-gateway --ignore-not-found
# 2. Uninstall kgateway helm releases
helm uninstall kgateway -n kgateway-system
helm uninstall kgateway-crds -n kgateway-system
# 3. Delete the kgateway-system namespace (optional, cleans up everything in it)
helm uninstall kgateway --namespace kgateway-system
kubectl delete namespace kgateway-system --ignore-not-found
# 4. Delete the Inference Extension CRDs
IGW_LATEST_RELEASE=v1.2.1
kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${IGW_LATEST_RELEASE}/manifests.yaml --ignore-not-found
# 5. Delete the Gateway API CRDs
GATEWAY_API_VERSION=v1.4.1
kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml --ignore-not-found
```
## Gateway API Inference Extension Integration
This section documents the updated plugin implementation for Gateway API Inference Extension **v1.2.1**.
### v1.2.1 API Changes
### Building for v1.2.1
The plugin code for v1.2.1 is in:
- `pkg/plugins/dynamo_kv_scorer/plugin.go`
### Header-Only Routing for v1.2.1
In v1.2.1, the EPP uses a **header-only approach** for communicating routing decisions.
The plugins set HTTP headers that are forwarded to the backend workers.
#### Headers Set by Dynamo Plugins
| Header | Description | Set By |
|--------|-------------|--------|
| `x-worker-instance-id` | Primary worker ID (decode worker in disagg mode) | kv-aware-scorer |
| `x-prefill-instance-id` | Prefill worker ID (disaggregated mode only) | kv-aware-scorer |
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds the Dynamo KV router static library, stages it (plus its C header and
# the EPP Dockerfile) inside a GAIE checkout, applies the Dynamo patch when
# possible, and finally builds the custom EPP image via the GAIE Makefile.
#
# Environment:
#   DYNAMO_DIR    (required) path to the Dynamo source tree
#   GAIE_DIR      (required) path to the gateway-api-inference-extension checkout
#   DOCKER_PROXY  (optional) forwarded to make as DOCKER_PROXY=<value>

# Exit on any error, on use of an unset variable, and on failed pipeline stages
set -euo pipefail

# Configuration - Set these environment variables before running
# (":-" keeps the checks working under `set -u` when the variable is unset)
if [[ -z "${DYNAMO_DIR:-}" ]]; then
    echo "DYNAMO_DIR environment variable must be set"
    echo " Example: export DYNAMO_DIR=/path/to/dynamo"
    exit 1
fi
if [[ -z "${GAIE_DIR:-}" ]]; then
    echo "GAIE_DIR environment variable must be set"
    echo " Example: export GAIE_DIR=/path/to/gateway-api-inference-extension"
    exit 1
fi

DYNAMO_LIB_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib"
DYNAMO_INCLUDE_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/include"
GAIE_DOCKERFILE="${GAIE_DIR}/Dockerfile.dynamo"

echo "Building Dynamo KV Router C Library..."

# Step 1: Build the static library
echo "Building static library..."
cd "${DYNAMO_DIR}"
cargo build --release -p libdynamo_llm

# Step 2: Generate header file (with fallback)
echo "Generating C header..."
HEADER_OUTPUT="${DYNAMO_DIR}/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h"
# Make sure the destination exists so the fallback copy below cannot fail on a
# missing directory.
mkdir -p "$(dirname "${HEADER_OUTPUT}")"
if ! cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm --output "${HEADER_OUTPUT}"; then
    echo "cbindgen failed, using fallback header..."
    cp "${DYNAMO_DIR}/lib/bindings/c/src/fallback_header.h" "${HEADER_OUTPUT}"
fi

# Step 3: Ensure directories exist
echo "Preparing directories..."
mkdir -p "${DYNAMO_LIB_DIR}"
mkdir -p "${DYNAMO_INCLUDE_DIR}"

# Step 4: Copy files to GAIE project
echo "Copying files to the GAIE project..."
cp "${HEADER_OUTPUT}" "${DYNAMO_INCLUDE_DIR}/"
cp "${DYNAMO_DIR}/target/release/libdynamo_llm_capi.a" "${DYNAMO_LIB_DIR}/"
cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${GAIE_DOCKERFILE}"

# Verify files were copied
if [[ ! -f "${DYNAMO_INCLUDE_DIR}/llm_engine.h" ]]; then
    echo "Header file copy failed!"
    exit 1
fi
if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
    echo "Library file copy failed!"
    exit 1
fi
if [[ ! -f "${GAIE_DOCKERFILE}" ]]; then
    echo "Dockerfile.dynamo file copy failed!"
    exit 1
fi

echo "Files copied successfully:"
echo " Header: ${DYNAMO_INCLUDE_DIR}/llm_engine.h"
echo " Library: ${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a"
# Report the path the Dockerfile was actually copied to (Dockerfile.dynamo).
echo " Docker: ${GAIE_DOCKERFILE}"

# Step 5: Apply Dynamo patch (if it exists)
# Best effort: a patch that does not apply cleanly is reported but does not
# abort the build, since it may already have been applied to this checkout.
echo "Applying Dynamo patch..."
cd "${GAIE_DIR}"
PATCH_FILE="${DYNAMO_DIR}/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch"
if [[ -f "${PATCH_FILE}" ]]; then
    if git apply --check "${PATCH_FILE}" 2>/dev/null; then
        git apply "${PATCH_FILE}"
        echo "Patch applied successfully"
    else
        echo "Patch doesn't apply cleanly - may already be applied or need manual resolution"
    fi
else
    echo "No patch file found at ${PATCH_FILE}"
fi

# Step 6: Build the EPP image
echo "Building the custom EPP image for GAIE..."
# Assemble the make invocation as an array so a DOCKER_PROXY value (e.g., from
# ECR_HOSTNAME) is passed as a single argument regardless of its contents.
MAKE_CMD=(make)
if [[ -n "${DOCKER_PROXY:-}" ]]; then
    echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
    MAKE_CMD+=("DOCKER_PROXY=${DOCKER_PROXY}")
fi
MAKE_CMD+=(dynamo-image-local-load)
# Runs from GAIE_DIR (see the cd in Step 5).
"${MAKE_CMD[@]}"

echo "EPP image with Dynamo KV routing built"
diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index b5e0617..8592735 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -22,6 +22,11 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
+ eppplugins "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+
+ // Dynamo plugins
+ dynprereq "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid"
+ dynscorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
)
func main() {
@@ -30,6 +35,9 @@ func main() {
// For adding out-of-tree plugins to the plugins registry, use the following:
// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
+ eppplugins.Register("dynamo-inject-workerid", dynprereq.InjectWorkerIDPreRequestFactory)
+ eppplugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
+
if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
os.Exit(1)
}
diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
index 32fffc0..1aa1b85 100644
--- a/pkg/bbr/handlers/request.go
+++ b/pkg/bbr/handlers/request.go
@@ -18,8 +18,10 @@ package handlers
import (
"context"
+ "encoding/base64"
"encoding/json"
"fmt"
+ "strings"
basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
@@ -31,11 +33,49 @@ import (
const modelHeader = "X-Gateway-Model-Name"
+// Dynamo-related
+const (
+ workerIDHeader = "x-worker-instance-id"
+ injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
+ tokenDataHeader = "x-epp-inject-nvext-token-data"
+)
+
// HandleRequestBody handles request bodies.
func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]*eppb.ProcessingResponse, error) {
logger := log.FromContext(ctx)
var ret []*eppb.ProcessingResponse
+ // If we captured a worker id hint in the headers phase, inject it into body JSON:
+ // nvext.backend_instance_id = <workerID>
+ if wid := strings.TrimSpace(s.workerIDHint); wid != "" {
+ // ensure nvext is a map[string]any
+ if nv, ok := data["nvext"]; !ok || nv == nil {
+ data["nvext"] = map[string]any{"backend_instance_id": wid}
+ } else if m, ok := nv.(map[string]any); ok {
+ m["backend_instance_id"] = wid
+ } else {
+ // if nvext was some other type, replace with a clean map
+ data["nvext"] = map[string]any{"backend_instance_id": wid}
+ }
+ }
+
+ // If we captured token_data in headers, decode and inject as nvext.token_data
+ if td := strings.TrimSpace(s.tokenDataHint); td != "" {
+ // header value is base64(JSON array)
+ if raw, err := base64.StdEncoding.DecodeString(td); err == nil {
+ var arr []int64
+ if err := json.Unmarshal(raw, &arr); err == nil && len(arr) > 0 {
+ // ensure nvext map exists
+ nv, ok := data["nvext"].(map[string]any)
+ if !ok || nv == nil {
+ nv = map[string]any{}
+ data["nvext"] = nv
+ }
+ nv["token_data"] = arr
+ }
+ }
+ }
+
requestBodyBytes, err := json.Marshal(data)
if err != nil {
return nil, err
@@ -46,6 +86,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
metrics.RecordModelNotInBodyCounter()
logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter")
if s.streaming {
+ // still stream the possibly mutated body
ret = append(ret, &eppb.ProcessingResponse{
Response: &eppb.ProcessingResponse_RequestHeaders{
RequestHeaders: &eppb.HeadersResponse{},
@@ -53,14 +94,24 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
})
ret = addStreamedBodyResponse(ret, requestBodyBytes)
return ret, nil
- } else {
- ret = append(ret, &eppb.ProcessingResponse{
+ }
+
+ // non-streaming: return a body response with the (possibly) mutated body
+ return []*eppb.ProcessingResponse{
+ {
Response: &eppb.ProcessingResponse_RequestBody{
- RequestBody: &eppb.BodyResponse{},
+ RequestBody: &eppb.BodyResponse{
+ Response: &eppb.CommonResponse{
+ BodyMutation: &eppb.BodyMutation{
+ Mutation: &eppb.BodyMutation_Body{
+ Body: requestBodyBytes,
+ },
+ },
+ },
+ },
},
- })
- }
- return ret, nil
+ },
+ }, nil
}
modelStr, ok := modelVal.(string)
@@ -73,6 +124,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
metrics.RecordSuccessCounter()
if s.streaming {
+ // set the model header, then stream the (possibly) mutated body
ret = append(ret, &eppb.ProcessingResponse{
Response: &eppb.ProcessingResponse_RequestHeaders{
RequestHeaders: &eppb.HeadersResponse{
@@ -86,16 +138,42 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
RawValue: []byte(modelStr),
},
},
+ // also keep the worker id header if we have one
+ func() *basepb.HeaderValueOption {
+ if strings.TrimSpace(s.workerIDHint) == "" {
+ return nil
+ }
+ return &basepb.HeaderValueOption{
+ Header: &basepb.HeaderValue{
+ Key: workerIDHeader,
+ RawValue: []byte(s.workerIDHint),
+ },
+ }
+ }(),
},
},
},
},
},
})
+
+ // prune nil entries if worker id not present
+ hm := ret[len(ret)-1].GetRequestHeaders().GetResponse().GetHeaderMutation()
+ if hm != nil && hm.SetHeaders != nil {
+ out := hm.SetHeaders[:0]
+ for _, h := range hm.SetHeaders {
+ if h != nil {
+ out = append(out, h)
+ }
+ }
+ hm.SetHeaders = out
+ }
+
ret = addStreamedBodyResponse(ret, requestBodyBytes)
return ret, nil
}
+ // Non-streaming: set model header and replace the body with our mutated JSON
return []*eppb.ProcessingResponse{
{
Response: &eppb.ProcessingResponse_RequestBody{
@@ -111,6 +189,22 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
RawValue: []byte(modelStr),
},
},
+ func() *basepb.HeaderValueOption {
+ if strings.TrimSpace(s.workerIDHint) == "" {
+ return nil
+ }
+ return &basepb.HeaderValueOption{
+ Header: &basepb.HeaderValue{
+ Key: workerIDHeader,
+ RawValue: []byte(s.workerIDHint),
+ },
+ }
+ }(),
+ },
+ },
+ BodyMutation: &eppb.BodyMutation{
+ Mutation: &eppb.BodyMutation_Body{
+ Body: requestBodyBytes,
},
},
},
@@ -141,6 +235,32 @@ func addStreamedBodyResponse(responses []*eppb.ProcessingResponse, requestBodyBy
// HandleRequestHeaders handles request headers.
func (s *Server) HandleRequestHeaders(headers *eppb.HttpHeaders) ([]*eppb.ProcessingResponse, error) {
+ // reset per-request
+ s.workerIDHint = ""
+ s.tokenDataHint = ""
+
+ if m := headers.GetHeaders(); m != nil {
+ for _, h := range m.GetHeaders() {
+ k := strings.ToLower(h.GetKey())
+
+ switch k {
+ case injectHintHeader, workerIDHeader:
+ if rv := h.GetRawValue(); len(rv) > 0 {
+ s.workerIDHint = strings.TrimSpace(string(rv))
+ } else {
+ s.workerIDHint = strings.TrimSpace(h.GetValue())
+ }
+ case tokenDataHeader:
+ if rv := h.GetRawValue(); len(rv) > 0 {
+ s.tokenDataHint = strings.TrimSpace(string(rv))
+ } else {
+ s.tokenDataHint = strings.TrimSpace(h.GetValue())
+ }
+ }
+ }
+ }
+
+ // No header mutations needed here; body phase will do the JSON injection.
return []*eppb.ProcessingResponse{
{
Response: &eppb.ProcessingResponse_RequestHeaders{
diff --git a/pkg/bbr/handlers/server.go b/pkg/bbr/handlers/server.go
index a580380..eb2893f 100644
--- a/pkg/bbr/handlers/server.go
+++ b/pkg/bbr/handlers/server.go
@@ -38,7 +38,9 @@ func NewServer(streaming bool) *Server {
// Server implements the Envoy external processing server.
// https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
type Server struct {
- streaming bool
+ streaming bool
+ workerIDHint string
+ tokenDataHint string
}
func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
new file mode 100644
index 0000000..b6708fa
--- /dev/null
+++ b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
@@ -0,0 +1,69 @@
+package dynamo_inject_workerid
+
+import (
+ "context"
+ "encoding/json"
+ "strings"
+
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// Type and default instance name reported through TypedName.
+const (
+	typeString = "dynamo-inject-workerid"
+	pluginName = "dynamo-inject-workerid"
+)
+
+// Request header keys read and written by PreRequest.
+const (
+	WorkerIDHeader   = "x-worker-instance-id"
+	injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
+	TokenDataHeader  = "x-epp-inject-nvext-token-data"
+)
+
+var _ plugins.Plugin = (*InjectWorkerIDPreRequest)(nil)
+var _ rc.PreRequest = (*InjectWorkerIDPreRequest)(nil)
+
+// InjectWorkerIDPreRequest is a plugin whose PreRequest hook copies the worker
+// instance id header into the inject-hint header.
+type InjectWorkerIDPreRequest struct {
+	typedName plugins.TypedName
+}
+
+// NewInjectWorkerIDPreRequest returns a plugin carrying the default type and name.
+func NewInjectWorkerIDPreRequest() *InjectWorkerIDPreRequest {
+	plugin := new(InjectWorkerIDPreRequest)
+	plugin.typedName.Type = typeString
+	plugin.typedName.Name = pluginName
+	return plugin
+}
+
+// WithName overrides the instance name and returns the receiver for chaining.
+func (p *InjectWorkerIDPreRequest) WithName(name string) *InjectWorkerIDPreRequest {
+	p.typedName.Name = name
+	return p
+}
+
+// InjectWorkerIDPreRequestFactory builds the plugin under the given name; the
+// raw parameters and the handle are ignored.
+func InjectWorkerIDPreRequestFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	plugin := NewInjectWorkerIDPreRequest()
+	plugin.WithName(name)
+	return plugin, nil
+}
+
+// TypedName reports the plugin's type and instance name.
+func (p *InjectWorkerIDPreRequest) TypedName() plugins.TypedName {
+	return p.typedName
+}
+
+// PreRequest normalizes the worker id and token-data headers on req.
+//
+// A nil req is ignored. A nil header map is replaced by an empty one. When the
+// WorkerIDHeader value is blank after trimming, nothing else is touched.
+// Otherwise the trimmed id is stored under both WorkerIDHeader and
+// injectHintHeader, and a non-blank TokenDataHeader value is rewritten trimmed.
+func (p *InjectWorkerIDPreRequest) PreRequest(
+	_ context.Context,
+	req *schedtypes.LLMRequest,
+	_ *schedtypes.SchedulingResult,
+	_ int,
+) {
+	if req == nil {
+		return
+	}
+	if req.Headers == nil {
+		req.Headers = make(map[string]string)
+	}
+
+	workerID := strings.TrimSpace(req.Headers[WorkerIDHeader])
+	if workerID == "" {
+		return
+	}
+	for _, key := range []string{WorkerIDHeader, injectHintHeader} {
+		req.Headers[key] = workerID
+	}
+
+	// Keep a token-data header that is already present, minus surrounding whitespace.
+	tokenData := strings.TrimSpace(req.Headers[TokenDataHeader])
+	if tokenData != "" {
+		req.Headers[TokenDataHeader] = tokenData
+	}
+}
diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
new file mode 100644
index 0000000..2d92be0
--- /dev/null
+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
@@ -0,0 +1,24 @@
+# This is an example for configuring the EPP to use the dynamo token-aware kv router for scoring the pods
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+ # Required: tells EPP which profile to use (even if you only have one)
+ - type: single-profile-handler
+
+ # Picker: chooses the final endpoint after scoring
+ - name: picker
+ type: max-score-picker
+ - name: dyn-pre
+ type: dynamo-inject-workerid
+ parameters: {}
+ - name: dyn-kv
+ type: kv-aware-scorer
+ parameters:
+ frontendURL: http://127.0.0.1:8000/v1/chat/completions
+ timeoutMS: 10000
+schedulingProfiles:
+ - name: default
+ plugins:
+ - pluginRef: dyn-kv
+ weight: 1
+ - pluginRef: picker
diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
new file mode 100644
index 0000000..50eb5f6
--- /dev/null
+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
@@ -0,0 +1,431 @@
+package dynamo_kv_scorer
+
+import (
+ "bufio"
+ "bytes"
+ "context"
+ "encoding/base64"
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "strings"
+ "time"
+
+ log "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+ logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+// Default instance name and plugin type used by NewKVAwareScorer.
+const (
+	PluginName        = "dynamo-kv-scorer"
+	KVAwareScorerType = "kv-aware-scorer"
+)
+
+// StateKeyWorkerInstanceID is the CycleState key under which Score stores the
+// selected worker id.
+const StateKeyWorkerInstanceID = schedtypes.StateKey("dynamo/worker-instance-id")
+
+// Request headers that Score sets when the frontend returns a selection.
+const (
+	WorkerIDHeader  = "x-worker-instance-id"
+	TokenDataHeader = "x-epp-inject-nvext-token-data"
+)
+
+// params is the JSON shape that KVAwareScorerFactory decodes from the plugin's
+// raw parameters.
+type params struct {
+ // FrontendURL is handed to WithFrontend; an empty value keeps the default URL.
+ FrontendURL string `json:"frontendURL"`
+ // TimeoutMS is the request timeout in milliseconds; the factory substitutes
+ // 10 seconds when the value is zero or negative.
+ TimeoutMS int `json:"timeoutMS"`
+}
+
+// stateString lets a plain string be stored in CycleState by giving it the
+// Clone method that schedtypes.StateData requires.
+type stateString string
+
+// Clone returns the receiver unchanged.
+func (s stateString) Clone() schedtypes.StateData {
+	return s
+}
+
+// KVAwareScorer is a scorer plugin that queries a Dynamo frontend over HTTP
+// for a worker selection.
+type KVAwareScorer struct {
+	typedName plugins.TypedName
+	feURL     string        // frontend endpoint that receives the POST
+	feTimeout time.Duration // deadline applied to each frontend call
+}
+
+// compile-time interface checks
+var (
+	_ plugins.Plugin   = (*KVAwareScorer)(nil)
+	_ framework.Scorer = (*KVAwareScorer)(nil)
+)
+
+// NewKVAwareScorer returns a scorer with the default name, frontend URL and
+// a 10 second timeout.
+func NewKVAwareScorer() *KVAwareScorer {
+	scorer := new(KVAwareScorer)
+	scorer.typedName = plugins.TypedName{Type: KVAwareScorerType, Name: PluginName}
+	scorer.feURL = "http://127.0.0.1:8000/v1/chat/completions"
+	scorer.feTimeout = 10 * time.Second
+	return scorer
+}
+
+// WithName overrides the instance name and returns the receiver for chaining.
+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer {
+	k.typedName.Name = name
+	return k
+}
+
+// WithFrontend overrides the frontend URL and timeout. An empty url or a
+// non-positive timeout leaves the corresponding current value in place.
+func (k *KVAwareScorer) WithFrontend(url string, timeout time.Duration) *KVAwareScorer {
+	if len(url) != 0 {
+		k.feURL = url
+	}
+	if timeout > 0 {
+		k.feTimeout = timeout
+	}
+	return k
+}
+
+func KVAwareScorerFactory(name string, raw json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+ p := params{}
+ _ = json.Unmarshal(raw, &p)
+ timeout := time.Duration(p.TimeoutMS) * time.Millisecond
+ if timeout <= 0 {
+ timeout = 10 * time.Second
+ }
+ return NewKVAwareScorer().WithName(name).WithFrontend(p.FrontendURL, timeout), nil
+}
+
+// TypedName reports the scorer's type and instance name.
+func (k *KVAwareScorer) TypedName() plugins.TypedName {
+	return k.typedName
+}
+
+// Score asks the Dynamo frontend which worker should serve req and records the
+// answer; every pod then receives the same score of 1.0.
+//
+// When the frontend returns a worker id, it is written to cycle under
+// StateKeyWorkerInstanceID and, if req is non-nil, stored in the
+// WorkerIDHeader request header. Token data, when present, is stored
+// base64-encoded under TokenDataHeader. A failed frontend call is logged and
+// scoring continues without a worker id.
+func (k *KVAwareScorer) Score(
+	ctx context.Context,
+	cycle *schedtypes.CycleState,
+	req *schedtypes.LLMRequest,
+	pods []schedtypes.Pod,
+) map[schedtypes.Pod]float64 {
+	logger := log.FromContext(ctx)
+
+	workerID, tokenData, err := k.callFrontEndForWorker(ctx, req)
+	switch {
+	case err != nil:
+		logger.V(logutil.DEFAULT).Error(err, "FrontEnd call failed; proceeding without worker id")
+	case workerID != "":
+		cycle.Write(StateKeyWorkerInstanceID, stateString(workerID))
+		// The frontend call tolerates a nil request, so the header update must too.
+		if req != nil {
+			if req.Headers == nil {
+				req.Headers = map[string]string{}
+			}
+			req.Headers[WorkerIDHeader] = workerID
+			if len(tokenData) > 0 {
+				req.Headers[TokenDataHeader] = encodeTokenData(tokenData)
+			}
+		}
+	}
+
+	// Uniform scores: the selection travels in the headers above, not in the pod ranking.
+	out := make(map[schedtypes.Pod]float64, len(pods))
+	for _, p := range pods {
+		out[p] = 1.0
+	}
+	return out
+}
+
+// Call the Dynamo FrontEnd and extract worker_instance_id via SSE.
+func (k *KVAwareScorer) callFrontEndForWorker(
+ ctx context.Context,
+ req *schedtypes.LLMRequest,
+) (string, []int64, error) {
+ logger := log.FromContext(ctx)
+
+ feBody := buildFrontEndBodyFromLLMRequest(req)
+ payload, err := json.Marshal(feBody)
+ if err != nil {
+ logger.V(logutil.DEFAULT).Error(err, "Dynamo FrontEnd marshal failed")
+ return "", nil, fmt.Errorf("marshal FrontEnd body: %w", err)
+ }
+
+ reqCtx, cancel := context.WithTimeout(ctx, k.feTimeout)
+ defer cancel()
+
+ httpReq, err := http.NewRequestWithContext(reqCtx, http.MethodPost, k.feURL, bytes.NewReader(payload))
+ if err != nil {
+ logger.V(logutil.DEFAULT).Error(err, "Dynamo FrontEnd request build failed")
+ return "", nil, fmt.Errorf("build FrontEnd request: %w", err)
+ }
+ httpReq.Header.Set("Content-Type", "application/json")
+ httpReq.Header.Set("Accept", "text/event-stream")
+
+ client := &http.Client{Timeout: 0}
+ resp, err := client.Do(httpReq)
+ if err != nil {
+ logger.V(logutil.DEFAULT).Error(err, "Dynamo FrontEnd POST failed")
+ return "", nil, fmt.Errorf("FrontEnd POST failed: %w", err)
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+ errBody, _ := io.ReadAll(resp.Body)
+ logger.V(logutil.DEFAULT).Error(nil, "Dynamo FrontEnd non-2xx response",
+ "status_code", resp.StatusCode, "response_body", string(errBody))
+ return "", nil, fmt.Errorf("Dynamo FrontEnd error: %d body=%s", resp.StatusCode, string(errBody))
+ }
+
+ ct := strings.ToLower(resp.Header.Get("Content-Type"))
+ if !strings.Contains(ct, "text/event-stream") {
+ logger.V(logutil.DEFAULT).Error(nil, "Unexpected non-SSE response")
+ return "", nil, fmt.Errorf("unexpected non-SSE response (Content-Type=%q)", resp.Header.Get("Content-Type"))
+ }
+
+ // Parse SSE: expect `event: worker_instance_id`, a quoted id in a comment or data, and `data: [DONE]`
+ reader := bufio.NewReader(resp.Body)
+ workerID, tokenData, perr := parseSelectionFromSSE(ctx, reader)
+ if perr != nil {
+ return "", nil, perr
+ }
+ return workerID, tokenData, nil
+}
+
+// buildFrontEndBodyFromLLMRequest assembles the JSON body posted to the
+// frontend, using only fields of req (no header merging).
+//
+// The body always holds one user message (empty content when req is nil or its
+// prompt is blank), stream=true, max_tokens=1, temperature=0.0 and an nvext
+// annotation requesting query_instance_id. The model key is added only when
+// req.TargetModel is non-blank.
+func buildFrontEndBodyFromLLMRequest(req *schedtypes.LLMRequest) map[string]any {
+	prompt, model := "", ""
+	if req != nil {
+		if strings.TrimSpace(req.Prompt) != "" {
+			prompt = req.Prompt
+		}
+		if strings.TrimSpace(req.TargetModel) != "" {
+			model = req.TargetModel
+		}
+	}
+
+	body := map[string]any{
+		// /v1/chat/completions requires messages
+		"messages": []map[string]any{
+			{"role": "user", "content": prompt},
+		},
+		// SSE is forced so worker_instance_id can be parsed from the stream
+		"stream":      true,
+		"max_tokens":  1,
+		"temperature": 0.0,
+		// asks Dynamo to include the worker id
+		"nvext": map[string]any{
+			"annotations": []string{"query_instance_id"},
+		},
+	}
+	if model != "" {
+		body["model"] = model
+	}
+	return body
+}
+
+// parseSelectionFromSSE scans an SSE stream for a worker_instance_id and token_data.
+//
+// Expected pattern (events are separated by a blank line):
+//
+//	event: worker_instance_id
+//	: "8303679623149182543"
+//
+//	data: [DONE]
+//
+// or, with tokens:
+//
+//	event: worker_instance_id
+//	: "8228244551594056720"
+//
+//	event: token_data
+//	: "[151644,872,198,151644,872,198,14990,151645,198,151645,198,151644,77091,198]"
+//
+//	data: [DONE]
+//
+// JSON in data lines is also supported, with worker_instance_id at the top
+// level or under annotations, and token_data at the top level or under nvext.
+//
+// An event whose data or comment is exactly [DONE] ends the scan before any
+// other field of that event is examined; whatever was collected so far is
+// returned with a nil error, even if nothing was found. At EOF the pending
+// event is flushed and the collected values are returned; if neither value
+// was found, an error is returned. Any other read error is returned wrapped.
+// A final line without a trailing newline is still parsed.
+func parseSelectionFromSSE(ctx context.Context, reader *bufio.Reader) (string, []int64, error) {
+	logger := log.FromContext(ctx)
+
+	var (
+		eventName  string
+		dataBuf    strings.Builder // accumulates "data:" lines for one event
+		commentBuf strings.Builder // accumulates ":" comment lines
+		gotWID     string
+		gotTD      []int64
+		rawBuf     strings.Builder // the exact SSE bytes, kept for debugging
+	)
+
+	// flushEvent interprets the buffered event and reports whether it was the
+	// [DONE] terminator.
+	flushEvent := func() bool {
+		data := strings.TrimSpace(dataBuf.String())
+		comment := strings.TrimSpace(commentBuf.String())
+		dataBuf.Reset()
+		commentBuf.Reset()
+
+		// [DONE] ends the stream
+		if data == "[DONE]" || comment == "[DONE]" {
+			logger.V(logutil.DEFAULT).Info("SSE stream DONE")
+			logger.V(logutil.DEFAULT).Info("SSE raw stream", "raw", rawBuf.String())
+			if gotWID != "" && len(gotTD) == 0 {
+				logger.V(logutil.DEFAULT).Info("SSE DONE: worker_instance_id present, token_data missing")
+			}
+			return true
+		}
+
+		// Prefer the named event
+		if eventName == "worker_instance_id" {
+			candidate := data
+			if candidate == "" {
+				candidate = comment
+			}
+			if candidate != "" {
+				// Try JSON string
+				var s string
+				if json.Unmarshal([]byte(candidate), &s) == nil && s != "" {
+					logger.V(logutil.VERBOSE).Info("worker_instance_id extracted from named event", "worker_instance_id", s)
+					gotWID = s
+					return false
+				}
+				// Fallback: strip quotes
+				clean := strings.Trim(candidate, "\"")
+				if clean != "" && clean != "[DONE]" {
+					logger.V(logutil.DEFAULT).Info("worker_instance_id extracted (raw) from named event", "worker_instance_id", clean)
+					gotWID = clean
+					return false
+				}
+			}
+		}
+
+		if eventName == "token_data" {
+			candidate := data
+			if candidate == "" {
+				candidate = comment
+			}
+			if candidate != "" {
+				if arr := toInt64SliceJSON(candidate); len(arr) > 0 {
+					gotTD = arr
+					logger.V(logutil.DEFAULT).Info("token_data extracted from named event", "count", len(arr))
+					return false
+				}
+			}
+		}
+
+		// Generic JSON in data:
+		if data != "" {
+			var msg map[string]any
+			if json.Unmarshal([]byte(data), &msg) == nil {
+				if wid, ok := msg["worker_instance_id"].(string); ok && wid != "" {
+					logger.V(logutil.DEFAULT).Info("worker_instance_id found in SSE payload root", "worker_instance_id", wid)
+					gotWID = wid
+				}
+				if ann, ok := msg["annotations"].(map[string]any); ok {
+					if wid, ok := ann["worker_instance_id"].(string); ok && wid != "" {
+						logger.V(logutil.DEFAULT).Info("worker_instance_id found in SSE annotations", "worker_instance_id", wid)
+						gotWID = wid
+					}
+				}
+				if td, ok := msg["token_data"]; ok {
+					if arr := toInt64Slice(td); len(arr) > 0 {
+						gotTD = arr
+						logger.V(logutil.DEFAULT).Info("token_data found in SSE payload root", "count", len(arr))
+					}
+				} else if nv, ok := msg["nvext"].(map[string]any); ok {
+					if td, ok := nv["token_data"]; ok {
+						if arr := toInt64Slice(td); len(arr) > 0 {
+							gotTD = arr
+							logger.V(logutil.DEFAULT).Info("token_data found in SSE nvext", "count", len(arr))
+						}
+					}
+				}
+			}
+		}
+		return false
+	}
+
+	// consumeLine feeds one raw line into the event buffers and reports whether
+	// the stream terminator was reached.
+	consumeLine := func(line string) bool {
+		l := strings.TrimRight(line, "\r\n")
+		if l == "" {
+			// End of current event; flushEvent already logs the DONE details.
+			if flushEvent() {
+				return true
+			}
+			eventName = "" // reset for next event
+			return false
+		}
+
+		// Comment line
+		if strings.HasPrefix(l, ":") {
+			commentLine := strings.TrimSpace(l[1:])
+			if commentBuf.Len() > 0 {
+				commentBuf.WriteByte('\n')
+			}
+			commentBuf.WriteString(commentLine)
+			return false
+		}
+
+		// "field: value"
+		if idx := strings.IndexByte(l, ':'); idx != -1 {
+			field := l[:idx]
+			val := strings.TrimSpace(l[idx+1:])
+			switch field {
+			case "event":
+				eventName = val
+			case "data":
+				if dataBuf.Len() > 0 {
+					dataBuf.WriteByte('\n')
+				}
+				dataBuf.WriteString(val)
+			default:
+				// ignore id, retry, etc.
+			}
+		}
+		return false
+	}
+
+	for {
+		line, err := reader.ReadString('\n')
+		// capture the raw stream as-is for debugging
+		rawBuf.WriteString(line)
+		if err != nil && err != io.EOF {
+			logger.V(logutil.DEFAULT).Error(err, "SSE read error")
+			return "", nil, fmt.Errorf("sse read error: %w", err)
+		}
+
+		// ReadString returns the bytes read before EOF together with the error,
+		// so a last line lacking its newline must still be parsed.
+		if err == nil || line != "" {
+			if consumeLine(line) {
+				return gotWID, gotTD, nil
+			}
+		}
+
+		if err == io.EOF {
+			flushEvent()
+			logger.V(logutil.DEFAULT).Info("SSE raw stream (EOF)", "raw", rawBuf.String())
+			if gotWID != "" && len(gotTD) == 0 {
+				logger.V(logutil.DEFAULT).Info("EOF: worker_instance_id present, token_data missing")
+			}
+			if gotWID != "" || len(gotTD) > 0 {
+				return gotWID, gotTD, nil
+			}
+			logger.V(logutil.DEFAULT).Error(nil, "EOF before selection fields present")
+			return "", nil, fmt.Errorf("selection not found in SSE stream (EOF)")
+		}
+	}
+}
+
+// encodeTokenData serializes tokens as a JSON array and returns its standard
+// base64 encoding, a form that is safe to carry in a header value.
+func encodeTokenData(tokens []int64) string {
+	raw, _ := json.Marshal(tokens)
+	encoded := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
+	base64.StdEncoding.Encode(encoded, raw)
+	return string(encoded)
+}
+
+// Accepts interface{} from a parsed JSON map
+func toInt64Slice(v any) []int64 {
+ xs, ok := v.([]any)
+ if !ok {
+ return nil
+ }
+ out := make([]int64, 0, len(xs))
+ for _, it := range xs {
+ switch n := it.(type) {
+ case float64:
+ out = append(out, int64(n))
+ case int64:
+ out = append(out, n)
+ case json.Number:
+ if i, err := n.Int64(); err == nil {
+ out = append(out, i)
+ }
+ }
+ }
+ return out
+}
+
+// toInt64SliceJSON extracts a non-empty []int64 from the text of an SSE event.
+//
+// Three shapes are tried in order: a JSON array ([1,2]), a JSON string that
+// itself holds a JSON array ("[1,2]"), and the input with leading and
+// trailing double quotes stripped. It returns nil when none of them yields
+// at least one element.
+func toInt64SliceJSON(s string) []int64 {
+	// decode parses text as a JSON array of integers; nil means failure or empty.
+	decode := func(text string) []int64 {
+		var ids []int64
+		if json.Unmarshal([]byte(text), &ids) != nil || len(ids) == 0 {
+			return nil
+		}
+		return ids
+	}
+
+	// shape 1: direct JSON array
+	if ids := decode(s); ids != nil {
+		return ids
+	}
+
+	// shape 2: JSON string wrapping a JSON array
+	var nested string
+	if json.Unmarshal([]byte(s), &nested) == nil && nested != "" {
+		if ids := decode(nested); ids != nil {
+			return ids
+		}
+	}
+
+	// shape 3: strip surrounding quotes and try once more
+	if stripped := strings.Trim(s, "\""); stripped != s {
+		return decode(stripped)
+	}
+	return nil
+}
diff --git a/Makefile b/Makefile
index dee7e99..4679ce2 100644
--- a/Makefile
+++ b/Makefile
@@ -170,6 +170,48 @@ verify-all:
##@ Build
+##@ Dynamo EPP with FFI
+
+# Build the Dynamo EPP image with CGO static library support
+# The recipe is a single shell invocation: make runs each recipe line in its own
+# shell, so a BUILDER assigned on one line is not visible on another. The EXIT
+# trap removes the temporary buildx builder even when one of the builds fails.
+.PHONY: dynamo-image-local-build
+dynamo-image-local-build: ## Build the Dynamo EPP image using Docker Buildx for local development.
+	set -e; \
+	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use); \
+	trap '$(DOCKER_BUILDX_CMD) rm "$$BUILDER"' EXIT; \
+	$(MAKE) dynamo-image-build PUSH=$(PUSH); \
+	$(MAKE) dynamo-image-build LOAD=$(LOAD)
+
+.PHONY: dynamo-image-local-push
+dynamo-image-local-push: PUSH=--push ## Build the Dynamo EPP image for local development and push it to $IMAGE_REPO.
+dynamo-image-local-push: dynamo-image-local-build
+
+.PHONY: dynamo-image-local-load
+dynamo-image-local-load: LOAD=--load ## Build the Dynamo EPP image for local development and load it in the local Docker registry.
+dynamo-image-local-load: dynamo-image-local-build
+
+# Builds with $(IMAGE_BUILD_CMD) from Dockerfile.dynamo in the current directory.
+# PUSH and LOAD are empty unless one of the -push / -load targets sets them.
+# NOTE(review): BASE_IMAGE is fixed to ubuntu:22.04 here and the Dockerfile must
+# exist under the name Dockerfile.dynamo — confirm the build script provides it.
+.PHONY: dynamo-image-build
+dynamo-image-build: ## Build the Dynamo EPP image using Docker Buildx with CGO support.
+ $(IMAGE_BUILD_CMD) -f Dockerfile.dynamo -t $(IMAGE_TAG) \
+ --platform=$(PLATFORMS) \
+ --build-arg BASE_IMAGE=ubuntu:22.04 \
+ --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
+ --build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \
+ --build-arg BUILD_REF=${BUILD_REF} \
+ $(PUSH) \
+ $(LOAD) \
+ $(IMAGE_BUILD_EXTRA_OPTS) ./
+
+.PHONY: dynamo-image-push
+dynamo-image-push: PUSH=--push ## Build the Dynamo EPP image and push it to $IMAGE_REPO.
+dynamo-image-push: dynamo-image-build
+
+.PHONY: dynamo-image-load
+dynamo-image-load: LOAD=--load ## Build the Dynamo EPP image and load it in the local Docker registry.
+dynamo-image-load: dynamo-image-build
+
+.PHONY: dynamo-image-kind
+dynamo-image-kind: dynamo-image-build ## Build the Dynamo EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
+ kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
+
# Build the container image
.PHONY: image-local-build
image-local-build: ## Build the EPP image using Docker Buildx for local development.
diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index b5e0617..8592735 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -22,6 +22,11 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
+ eppplugins "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+
+ // Dynamo plugins
+ dynprereq "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid"
+ dynscorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
)
func main() {
@@ -30,6 +35,9 @@ func main() {
// For adding out-of-tree plugins to the plugins registry, use the following:
// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
+ eppplugins.Register("dynamo-inject-workerid", dynprereq.InjectWorkerIDPreRequestFactory)
+ eppplugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
+
if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
os.Exit(1)
}
diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
index 32fffc0..1aa1b85 100644
--- a/pkg/bbr/handlers/request.go
+++ b/pkg/bbr/handlers/request.go
@@ -18,8 +18,10 @@ package handlers
import (
"context"
+ "encoding/base64"
"encoding/json"
"fmt"
+ "strings"
basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
@@ -31,11 +33,49 @@ import (
const modelHeader = "X-Gateway-Model-Name"
+// Dynamo-related
+const (
+ workerIDHeader = "x-worker-instance-id"
+ injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
+ tokenDataHeader = "x-epp-inject-nvext-token-data"
+)
+
// HandleRequestBody handles request bodies.
func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]*eppb.ProcessingResponse, error) {
logger := log.FromContext(ctx)
var ret []*eppb.ProcessingResponse
+ // If we captured a worker id hint in the headers phase, inject it into body JSON:
+ // nvext.backend_instance_id = <workerID>
+ if wid := strings.TrimSpace(s.workerIDHint); wid != "" {
+ // ensure nvext is a map[string]any
+ if nv, ok := data["nvext"]; !ok || nv == nil {
+ data["nvext"] = map[string]any{"backend_instance_id": wid}
+ } else if m, ok := nv.(map[string]any); ok {
+ m["backend_instance_id"] = wid
+ } else {
+ // if nvext was some other type, replace with a clean map
+ data["nvext"] = map[string]any{"backend_instance_id": wid}
+ }
+ }
+
+ // If we captured token_data in headers, decode and inject as nvext.token_data
+ if td := strings.TrimSpace(s.tokenDataHint); td != "" {
+ // header value is base64(JSON array)
+ if raw, err := base64.StdEncoding.DecodeString(td); err == nil {
+ var arr []int64
+ if err := json.Unmarshal(raw, &arr); err == nil && len(arr) > 0 {
+ // ensure nvext map exists
+ nv, ok := data["nvext"].(map[string]any)
+ if !ok || nv == nil {
+ nv = map[string]any{}
+ data["nvext"] = nv
+ }
+ nv["token_data"] = arr
+ }
+ }
+ }
+
requestBodyBytes, err := json.Marshal(data)
if err != nil {
return nil, err
@@ -46,6 +86,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
metrics.RecordModelNotInBodyCounter()
logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter")
if s.streaming {
+ // still stream the possibly mutated body
ret = append(ret, &eppb.ProcessingResponse{
Response: &eppb.ProcessingResponse_RequestHeaders{
RequestHeaders: &eppb.HeadersResponse{},
@@ -53,14 +94,24 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
})
ret = addStreamedBodyResponse(ret, requestBodyBytes)
return ret, nil
- } else {
- ret = append(ret, &eppb.ProcessingResponse{
+ }
+
+ // non-streaming: return a body response with the (possibly) mutated body
+ return []*eppb.ProcessingResponse{
+ {
Response: &eppb.ProcessingResponse_RequestBody{
- RequestBody: &eppb.BodyResponse{},
+ RequestBody: &eppb.BodyResponse{
+ Response: &eppb.CommonResponse{
+ BodyMutation: &eppb.BodyMutation{
+ Mutation: &eppb.BodyMutation_Body{
+ Body: requestBodyBytes,
+ },
+ },
+ },
+ },
},
- })
- }
- return ret, nil
+ },
+ }, nil
}
modelStr, ok := modelVal.(string)
@@ -73,6 +124,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
metrics.RecordSuccessCounter()
if s.streaming {
+ // set the model header, then stream the (possibly) mutated body
ret = append(ret, &eppb.ProcessingResponse{
Response: &eppb.ProcessingResponse_RequestHeaders{
RequestHeaders: &eppb.HeadersResponse{
@@ -86,16 +138,42 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
RawValue: []byte(modelStr),
},
},
+ // also keep the worker id header if we have one
+ func() *basepb.HeaderValueOption {
+ if strings.TrimSpace(s.workerIDHint) == "" {
+ return nil
+ }
+ return &basepb.HeaderValueOption{
+ Header: &basepb.HeaderValue{
+ Key: workerIDHeader,
+ RawValue: []byte(s.workerIDHint),
+ },
+ }
+ }(),
},
},
},
},
},
})
+
+ // prune nil entries if worker id not present
+ hm := ret[len(ret)-1].GetRequestHeaders().GetResponse().GetHeaderMutation()
+ if hm != nil && hm.SetHeaders != nil {
+ out := hm.SetHeaders[:0]
+ for _, h := range hm.SetHeaders {
+ if h != nil {
+ out = append(out, h)
+ }
+ }
+ hm.SetHeaders = out
+ }
+
ret = addStreamedBodyResponse(ret, requestBodyBytes)
return ret, nil
}
+ // Non-streaming: set model header and replace the body with our mutated JSON
return []*eppb.ProcessingResponse{
{
Response: &eppb.ProcessingResponse_RequestBody{
@@ -111,6 +189,22 @@ func (s *Server) HandleRequestBody(ctx context.Context, data map[string]any) ([]
RawValue: []byte(modelStr),
},
},
+ // Yields the worker id header option when a hint was captured; otherwise
+ // this list element evaluates to nil.
+ // NOTE(review): the streaming branch filters nil entries out of
+ // hm.SetHeaders afterwards, but nothing on this return path does — confirm
+ // a nil element is tolerated here or build the list conditionally.
+ func() *basepb.HeaderValueOption {
+ if strings.TrimSpace(s.workerIDHint) == "" {
+ return nil
+ }
+ return &basepb.HeaderValueOption{
+ Header: &basepb.HeaderValue{
+ Key: workerIDHeader,
+ RawValue: []byte(s.workerIDHint),
+ },
+ }
+ }(),
+ },
+ },
+ BodyMutation: &eppb.BodyMutation{
+ Mutation: &eppb.BodyMutation_Body{
+ Body: requestBodyBytes,
},
},
},
@@ -141,6 +235,32 @@ func addStreamedBodyResponse(responses []*eppb.ProcessingResponse, requestBodyBy
// HandleRequestHeaders handles request headers.
func (s *Server) HandleRequestHeaders(headers *eppb.HttpHeaders) ([]*eppb.ProcessingResponse, error) {
+ // reset per-request
+ s.workerIDHint = ""
+ s.tokenDataHint = ""
+
+ if m := headers.GetHeaders(); m != nil {
+ for _, h := range m.GetHeaders() {
+ k := strings.ToLower(h.GetKey())
+
+ switch k {
+ case injectHintHeader, workerIDHeader:
+ if rv := h.GetRawValue(); len(rv) > 0 {
+ s.workerIDHint = strings.TrimSpace(string(rv))
+ } else {
+ s.workerIDHint = strings.TrimSpace(h.GetValue())
+ }
+ case tokenDataHeader:
+ if rv := h.GetRawValue(); len(rv) > 0 {
+ s.tokenDataHint = strings.TrimSpace(string(rv))
+ } else {
+ s.tokenDataHint = strings.TrimSpace(h.GetValue())
+ }
+ }
+ }
+ }
+
+ // No header mutations needed here; body phase will do the JSON injection.
return []*eppb.ProcessingResponse{
{
Response: &eppb.ProcessingResponse_RequestHeaders{
diff --git a/pkg/bbr/handlers/server.go b/pkg/bbr/handlers/server.go
index a580380..eb2893f 100644
--- a/pkg/bbr/handlers/server.go
+++ b/pkg/bbr/handlers/server.go
@@ -38,7 +38,9 @@ func NewServer(streaming bool) *Server {
// Server implements the Envoy external processing server.
// https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
type Server struct {
- streaming bool
+ streaming bool
+ // workerIDHint and tokenDataHint hold header values captured by
+ // HandleRequestHeaders, which clears both before scanning a request's headers;
+ // HandleRequestBody later injects them into the body's nvext object.
+ // NOTE(review): these are fields of Server rather than per-stream state; if one
+ // Server value serves several streams concurrently, requests could overwrite
+ // each other's hints — confirm how Process shares s.
+ workerIDHint string
+ tokenDataHint string
}
func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
new file mode 100644
index 0000000..b6708fa
--- /dev/null
+++ b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
@@ -0,0 +1,69 @@
+package dynamo_inject_workerid
+
+import (
+ "context"
+ "encoding/json"
+ "strings"
+
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// Type and default instance name reported through TypedName.
+const (
+	typeString = "dynamo-inject-workerid"
+	pluginName = "dynamo-inject-workerid"
+)
+
+// Request header keys read and written by PreRequest.
+const (
+	WorkerIDHeader   = "x-worker-instance-id"
+	injectHintHeader = "x-epp-inject-nvext-worker-instance-id"
+	TokenDataHeader  = "x-epp-inject-nvext-token-data"
+)
+
+var _ plugins.Plugin = (*InjectWorkerIDPreRequest)(nil)
+var _ rc.PreRequest = (*InjectWorkerIDPreRequest)(nil)
+
+// InjectWorkerIDPreRequest is a plugin whose PreRequest hook copies the worker
+// instance id header into the inject-hint header.
+type InjectWorkerIDPreRequest struct {
+	typedName plugins.TypedName
+}
+
+// NewInjectWorkerIDPreRequest returns a plugin carrying the default type and name.
+func NewInjectWorkerIDPreRequest() *InjectWorkerIDPreRequest {
+	plugin := new(InjectWorkerIDPreRequest)
+	plugin.typedName.Type = typeString
+	plugin.typedName.Name = pluginName
+	return plugin
+}
+
+// WithName overrides the instance name and returns the receiver for chaining.
+func (p *InjectWorkerIDPreRequest) WithName(name string) *InjectWorkerIDPreRequest {
+	p.typedName.Name = name
+	return p
+}
+
+// InjectWorkerIDPreRequestFactory builds the plugin under the given name; the
+// raw parameters and the handle are ignored.
+func InjectWorkerIDPreRequestFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	plugin := NewInjectWorkerIDPreRequest()
+	plugin.WithName(name)
+	return plugin, nil
+}
+
+// TypedName reports the plugin's type and instance name.
+func (p *InjectWorkerIDPreRequest) TypedName() plugins.TypedName {
+	return p.typedName
+}
+
+// PreRequest normalizes the worker id and token-data headers on req.
+//
+// A nil req is ignored. A nil header map is replaced by an empty one. When the
+// WorkerIDHeader value is blank after trimming, nothing else is touched.
+// Otherwise the trimmed id is stored under both WorkerIDHeader and
+// injectHintHeader, and a non-blank TokenDataHeader value is rewritten trimmed.
+func (p *InjectWorkerIDPreRequest) PreRequest(
+	_ context.Context,
+	req *schedtypes.LLMRequest,
+	_ *schedtypes.SchedulingResult,
+	_ int,
+) {
+	if req == nil {
+		return
+	}
+	if req.Headers == nil {
+		req.Headers = make(map[string]string)
+	}
+
+	workerID := strings.TrimSpace(req.Headers[WorkerIDHeader])
+	if workerID == "" {
+		return
+	}
+	for _, key := range []string{WorkerIDHeader, injectHintHeader} {
+		req.Headers[key] = workerID
+	}
+
+	// Keep a token-data header that is already present, minus surrounding whitespace.
+	tokenData := strings.TrimSpace(req.Headers[TokenDataHeader])
+	if tokenData != "" {
+		req.Headers[TokenDataHeader] = tokenData
+	}
+}
diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
new file mode 100644
index 0000000..b689c00
--- /dev/null
+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
@@ -0,0 +1,21 @@
+# This is an example for configuring the EPP to use the dynamo token-aware kv router for scoring the pods
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+ # Required: tells EPP which profile to use (even if you only have one)
+ - type: single-profile-handler
+
+ # Picker: chooses the final endpoint after scoring
+ - name: picker
+ type: max-score-picker
+ - name: dyn-pre
+ type: dynamo-inject-workerid
+ parameters: {}
+ - name: dyn-kv
+ type: kv-aware-scorer
+schedulingProfiles:
+ - name: default
+ plugins:
+ - pluginRef: dyn-kv
+ weight: 1
+ - pluginRef: picker
diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
new file mode 100644
index 0000000..83a4ace
--- /dev/null
+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
@@ -0,0 +1,428 @@
+package dynamo_kv_scorer
+
+/*
+// cgo build settings: headers are taken from ./include and the binding is
+// linked statically against ./lib/libdynamo_llm_capi.a (both relative to this
+// package directory), plus the C++ runtime, libdl, libpthread and libm.
+#cgo CPPFLAGS: -I${SRCDIR}/include
+#cgo CXXFLAGS: -std=c++17
+#cgo LDFLAGS: ${SRCDIR}/lib/libdynamo_llm_capi.a -lstdc++ -ldl -lpthread -lm
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h> // for free
+#include <stdbool.h>
+
+// enum underlying type is uint32_t; matches cbindgen output
+typedef uint32_t dynamo_llm_result_t;
+enum { DYNAMO_OK = 0, DYNAMO_ERR = 1 };
+
+// opaque handle forward-decl
+struct WorkerSelectionPipeline;
+typedef struct WorkerSelectionPipeline WorkerSelectionPipeline;
+
+// Prototypes (C-compatible)
+// Hand-written declarations; they must stay in sync with the symbols exported
+// by the static library. The Go code in this file calls init/shutdown, the
+// pipeline create/destroy pair, the query function and the result-free
+// function. The publisher/event prototypes are declared but not called here.
+dynamo_llm_result_t dynamo_llm_init(const char *namespace_c_str,
+ const char *component_c_str,
+ int64_t worker_id,
+ uint32_t kv_block_size);
+
+dynamo_llm_result_t dynamo_llm_shutdown(void);
+dynamo_llm_result_t dynamo_llm_load_publisher_create(void);
+
+dynamo_llm_result_t dynamo_kv_event_publish_stored(uint64_t event_id,
+ const uint32_t *token_ids,
+ const uintptr_t *num_block_tokens,
+ const uint64_t *block_ids,
+ size_t num_blocks,
+ const uint64_t *parent_hash,
+ uint64_t lora_id);
+
+dynamo_llm_result_t dynamo_kv_event_publish_removed(uint64_t event_id,
+ const uint64_t *block_ids,
+ size_t num_blocks);
+
+// Creates the pipeline handle and returns it through pipeline_out.
+dynamo_llm_result_t dynamo_create_worker_selection_pipeline(const char *namespace_c_str,
+ const char *component_c_str,
+ const char *model_name_c_str,
+ bool use_kv_routing,
+ double busy_threshold,
+ double overlap_score_weight,
+ double router_temperature,
+ bool use_kv_events,
+ bool router_replica_sync,
+ WorkerSelectionPipeline **pipeline_out);
+
+dynamo_llm_result_t dynamo_destroy_worker_selection_pipeline(WorkerSelectionPipeline *pipeline);
+
+// Results come back through the *_out parameters. The Go caller hands the
+// token buffer and the annotated JSON to dynamo_free_worker_selection_result
+// after copying what it needs.
+dynamo_llm_result_t dynamo_query_worker_selection_and_annotate(WorkerSelectionPipeline *pipeline,
+ const char *request_json_c_str,
+ int64_t *worker_instance_id_out,
+ uint32_t **token_ids_out,
+ size_t *token_count_out,
+ char **annotated_request_json_out);
+
+dynamo_llm_result_t dynamo_free_worker_selection_result(uint32_t *token_ids,
+ size_t token_count,
+ char *annotated_request_json);
+*/
+import "C"
+
+import (
+ "context"
+ "encoding/base64"
+ "encoding/json"
+ "fmt"
+ "os"
+ "strings"
+ "sync"
+ "unsafe"
+
+ log "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+ logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+ // PluginName is the default instance name assigned by NewKVAwareScorer.
+ PluginName = "dynamo-kv-scorer"
+ // KVAwareScorerType is the plugin type string reported via TypedName.
+ KVAwareScorerType = "kv-aware-scorer"
+ // StateKeyWorkerInstanceID is the cycle-state key under which Score stores the selected worker id.
+ StateKeyWorkerInstanceID = schedtypes.StateKey("dynamo/worker-instance-id")
+ // WorkerIDHeader is the request header Score sets to the selected worker id.
+ WorkerIDHeader = "x-worker-instance-id"
+ // TokenDataHeader is the request header Score sets to the base64-encoded JSON token-id list.
+ TokenDataHeader = "x-epp-inject-nvext-token-data"
+)
+
+// --------------------------- config / env ---------------------------
+
+// warmupOnce and warmupErr make KVAwareScorerFactory run the FFI
+// initialization a single time and remember its outcome.
+var warmupOnce sync.Once
+var warmupErr error
+
+// stateString is the value type Score writes into the cycle state.
+type stateString string
+
+// params is the (currently empty) parameter set decoded by KVAwareScorerFactory.
+type params struct {
+}
+
+// Clone returns the receiver itself; stateString is a plain string value.
+func (s stateString) Clone() schedtypes.StateData { return s }
+
+// KVAwareScorer is the scorer plugin backed by the Dynamo router FFI.
+type KVAwareScorer struct {
+ typedName plugins.TypedName
+}
+
+// Compile-time checks that KVAwareScorer implements the plugin and scorer interfaces.
+var _ plugins.Plugin = (*KVAwareScorer)(nil)
+var _ framework.Scorer = (*KVAwareScorer)(nil)
+
+// NewKVAwareScorer builds a scorer carrying the default type and name.
+func NewKVAwareScorer() *KVAwareScorer {
+	scorer := new(KVAwareScorer)
+	scorer.typedName = plugins.TypedName{Type: KVAwareScorerType, Name: PluginName}
+	return scorer
+}
+
+// WithName overrides the instance name and returns the receiver for chaining.
+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer {
+	k.typedName.Name = name
+	return k
+}
+
+// KVAwareScorerFactory is the plugin-registry constructor for the scorer.
+//
+// The first invocation performs the one-time FFI initialization; a panic
+// raised during that initialization is converted into an error. Every
+// invocation returns the remembered initialization error, if any, instead of
+// a plugin.
+func KVAwareScorerFactory(name string, raw json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	// params has no fields yet; the decode result is discarded.
+	var cfg params
+	_ = json.Unmarshal(raw, &cfg)
+
+	// one-time FFI init (runtime + persistent pipeline)
+	warmupOnce.Do(func() {
+		defer func() {
+			if recovered := recover(); recovered != nil {
+				warmupErr = fmt.Errorf("Dynamo configuration error: %v", recovered)
+			}
+		}()
+		warmupErr = initFFI()
+	})
+	if err := warmupErr; err != nil {
+		return nil, fmt.Errorf("Dynamo FFI init for the Router failed: %w", err)
+	}
+
+	return NewKVAwareScorer().WithName(name), nil
+}
+
+// TypedName reports the type/name pair of this scorer instance.
+func (k *KVAwareScorer) TypedName() plugins.TypedName {
+	return k.typedName
+}
+
+// --------------------------- FFI integration ---------------------------
+
+var (
+ // ffiOnce guards the body of initFFI; ffiErr keeps the error it produced.
+ ffiOnce sync.Once
+ ffiErr error
+
+ // Settings filled in by loadDynamoConfig from DYNAMO_* environment variables.
+ ffiNamespace string
+ ffiComponent string
+ ffiModel string
+ ffiOverlapScoreWeight float64
+ ffiRouterTemperature float64
+ ffiKvBlockSize uint32
+ ffiWorkerID int64
+
+ // runtimeInitialized is set once dynamo_llm_init succeeds and cleared by
+ // cleanupDynamo after a successful shutdown.
+ runtimeInitialized bool
+
+ // Boxed pipeline handle (owned on the Rust side, opaque here)
+ pipeline *C.struct_WorkerSelectionPipeline
+ // pipelineMutex is taken when the pipeline handle is read, created or destroyed.
+ pipelineMutex sync.RWMutex
+)
+
+// loadDynamoConfig populates the package-level ffi* settings from DYNAMO_*
+// environment variables.
+//
+// Namespace, component, model, worker id, overlap score weight and router
+// temperature fall back to built-in defaults when unset or unparsable.
+// DYNAMO_KV_BLOCK_SIZE is mandatory and must be a power of two within
+// [16, 8192]; the function panics with a descriptive message otherwise
+// (KVAwareScorerFactory recovers that panic and reports it as an error).
+func loadDynamoConfig() {
+	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
+	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
+	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
+	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
+
+	ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
+	ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)
+
+	kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE")
+	if kvBlockSizeStr == "" {
+		panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
+	}
+	var tmp int64
+	if n, err := fmt.Sscanf(kvBlockSizeStr, "%d", &tmp); err != nil || n != 1 {
+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
+	}
+	// Range-check the parsed 64-bit value before narrowing it: converting
+	// first would let negative or >32-bit inputs wrap into the valid range
+	// (for example 4294967312 would silently become 16).
+	if tmp < 16 || tmp > 8192 {
+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", tmp))
+	}
+	ffiKvBlockSize = uint32(tmp)
+	if (ffiKvBlockSize & (ffiKvBlockSize - 1)) != 0 {
+		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", ffiKvBlockSize))
+	}
+	fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
+}
+
+// getEnvOrDefault returns the value of the environment variable key, or def
+// when the variable is unset or empty.
+func getEnvOrDefault(key, def string) string {
+	value := os.Getenv(key)
+	if value == "" {
+		return def
+	}
+	return value
+}
+// getEnvInt64OrDefault parses the environment variable key as a decimal
+// integer and returns def when the variable is unset, empty or not parsable.
+func getEnvInt64OrDefault(key string, def int64) int64 {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return def
+	}
+	var parsed int64
+	if count, err := fmt.Sscanf(raw, "%d", &parsed); err != nil || count != 1 {
+		return def
+	}
+	return parsed
+}
+// getEnvFloatOrDefault parses the environment variable key as a floating
+// point number and returns def when the variable is unset, empty or not
+// parsable.
+func getEnvFloatOrDefault(key string, def float64) float64 {
+	raw := os.Getenv(key)
+	if raw == "" {
+		return def
+	}
+	var parsed float64
+	if count, err := fmt.Sscanf(raw, "%f", &parsed); err != nil || count != 1 {
+		return def
+	}
+	return parsed
+}
+// getEnvBoolOrDefault interprets the environment variable key as a boolean.
+// "true", "1", "yes" and "on" (any letter case) mean true; "false", "0", "no"
+// and "off" mean false; anything else, including an unset or empty variable,
+// yields def.
+func getEnvBoolOrDefault(key string, def bool) bool {
+	switch strings.ToLower(os.Getenv(key)) {
+	case "true", "1", "yes", "on":
+		return true
+	case "false", "0", "no", "off":
+		return false
+	default:
+		return def
+	}
+}
+
+// initFFI initializes the Dynamo runtime and creates the persistent worker
+// selection pipeline. The work runs at most once (guarded by ffiOnce); every
+// call returns the error recorded by that single run, or nil.
+//
+// NOTE: loadDynamoConfig may panic. sync.Once treats a panicking function as
+// having returned, so later calls skip the body and return a nil ffiErr;
+// callers therefore also have to check runtimeInitialized.
+func initFFI() error {
+ ffiOnce.Do(func() {
+ loadDynamoConfig()
+
+ // C copies of the configuration strings; released when this closure returns.
+ // NOTE(review): assumes the native side does not keep these pointers — confirm.
+ ns := C.CString(ffiNamespace)
+ cm := C.CString(ffiComponent)
+ model := C.CString(ffiModel)
+ defer C.free(unsafe.Pointer(ns))
+ defer C.free(unsafe.Pointer(cm))
+ defer C.free(unsafe.Pointer(model))
+
+ // Init Dynamo runtime
+ if rc := C.dynamo_llm_init(ns, cm, C.int64_t(ffiWorkerID), C.uint32_t(ffiKvBlockSize)); rc != C.DYNAMO_OK {
+ ffiErr = fmt.Errorf("dynamo_llm_init failed")
+ return
+ }
+ runtimeInitialized = true
+
+ // Create persistent pipeline
+ pipelineMutex.Lock()
+ defer pipelineMutex.Unlock()
+
+ // Routing switches and the busy threshold are read straight from the
+ // environment here; the remaining values come from loadDynamoConfig.
+ rc := C.dynamo_create_worker_selection_pipeline(
+ ns,
+ cm,
+ model,
+ C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_ROUTING", true)),
+ C.double(getEnvFloatOrDefault("DYNAMO_BUSY_THRESHOLD", -1.0)),
+ C.double(ffiOverlapScoreWeight),
+ C.double(ffiRouterTemperature),
+ C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_EVENTS", true)),
+ C.bool(getEnvBoolOrDefault("DYNAMO_ROUTER_REPLICA_SYNC", true)),
+ &pipeline,
+ )
+ if rc != C.DYNAMO_OK {
+ // runtimeInitialized stays true on this path; the recorded error is
+ // what makes subsequent initFFI callers fail.
+ ffiErr = fmt.Errorf("dynamo_create_worker_selection_pipeline failed")
+ return
+ }
+ })
+ return ffiErr
+}
+
+// --------------------------- scoring ---------------------------
+
+// encodeTokenData serializes the token ids as a JSON array and returns that
+// JSON base64-encoded with the standard alphabet.
+func encodeTokenData(tokens []int64) string {
+	payload, _ := json.Marshal(tokens)
+	encoded := base64.StdEncoding.EncodeToString(payload)
+	return encoded
+}
+
+// Score asks the Dynamo router which worker should serve the request and
+// records that decision; it does not rank pods itself.
+//
+// When the router returns a worker id, the id is written to the cycle state
+// under StateKeyWorkerInstanceID and, if a request is present, to the
+// WorkerIDHeader request header; returned token ids are attached through
+// TokenDataHeader. A router failure is logged and otherwise ignored.
+//
+// Every candidate pod receives the same score (1.0).
+func (k *KVAwareScorer) Score(
+	ctx context.Context,
+	cycle *schedtypes.CycleState,
+	req *schedtypes.LLMRequest,
+	pods []schedtypes.Pod,
+) map[schedtypes.Pod]float64 {
+	logger := log.FromContext(ctx)
+
+	workerID, tokenData, err := k.callDynamoRouter(ctx, req)
+	switch {
+	case err != nil:
+		logger.V(logutil.DEFAULT).Error(err, "Dynamo call failed; proceeding without worker id")
+	case workerID != "":
+		logger.V(logutil.DEFAULT).Info(
+			"Dynamo router selected worker",
+			"workerID", workerID,
+			"tokenDataCount", len(tokenData),
+			"tokenData", tokenData,
+		)
+		cycle.Write(StateKeyWorkerInstanceID, stateString(workerID))
+		// The router path accepts a nil request (it substitutes a default
+		// prompt), so a worker id can come back without a request to annotate.
+		if req != nil {
+			if req.Headers == nil {
+				req.Headers = map[string]string{}
+			}
+			req.Headers[WorkerIDHeader] = workerID
+			if len(tokenData) > 0 {
+				req.Headers[TokenDataHeader] = encodeTokenData(tokenData)
+			}
+		}
+	}
+
+	out := make(map[schedtypes.Pod]float64, len(pods))
+	for _, p := range pods {
+		out[p] = 1.0
+	}
+	return out
+}
+
+// --------------------------- router call (persistent only) ---------------------------
+
+// callDynamoRouter queries the persistent worker selection pipeline for req.
+//
+// It returns the selected worker instance id formatted as a decimal string
+// and the token ids reported by the router (nil when none were returned). An
+// error is returned when FFI initialization failed, the runtime or pipeline
+// is unavailable, the request cannot be marshaled, or the native query
+// reports a non-OK status.
+func (k *KVAwareScorer) callDynamoRouter(
+	ctx context.Context,
+	req *schedtypes.LLMRequest,
+) (string, []int64, error) {
+	logger := log.FromContext(ctx)
+
+	if err := initFFI(); err != nil {
+		logger.V(logutil.DEFAULT).Error(err, "FFI init failed")
+		return "", nil, err
+	}
+	if !runtimeInitialized {
+		return "", nil, fmt.Errorf("dynamo runtime not initialized")
+	}
+
+	// Keep the read lock for the whole native call. cleanupDynamo destroys the
+	// pipeline under the write lock, so releasing the read lock right after
+	// copying the pointer would allow the handle to be freed while the query
+	// below is still using it. Concurrent queries still run in parallel.
+	pipelineMutex.RLock()
+	defer pipelineMutex.RUnlock()
+	currentPipeline := pipeline
+
+	if currentPipeline == nil {
+		return "", nil, fmt.Errorf("dynamo worker selection pipeline not created")
+	}
+
+	// Build OpenAI-compatible JSON request
+	requestBody := buildOpenAIRequest(req)
+	requestJSON, err := json.Marshal(requestBody)
+	if err != nil {
+		logger.V(logutil.DEFAULT).Error(err, "Failed to marshal OpenAI request")
+		return "", nil, fmt.Errorf("marshal OpenAI request: %w", err)
+	}
+	cRequestJSON := C.CString(string(requestJSON))
+	defer C.free(unsafe.Pointer(cRequestJSON))
+
+	// Output variables
+	var cWorkerID C.int64_t
+	var cTokens *C.uint32_t
+	var cTokenCount C.size_t
+	var cAnnotatedJSON *C.char
+
+	// Call the worker selection pipeline
+	rc := C.dynamo_query_worker_selection_and_annotate(
+		currentPipeline,
+		cRequestJSON,
+		&cWorkerID,
+		&cTokens,
+		&cTokenCount,
+		&cAnnotatedJSON,
+	)
+	if rc != C.DYNAMO_OK {
+		return "", nil, fmt.Errorf("dynamo_query_worker_selection_and_annotate failed")
+	}
+
+	// Copy tokens into Go memory and free C memory
+	count := int(uintptr(cTokenCount))
+	var tokens64 []int64
+	if count > 0 && cTokens != nil {
+		src := unsafe.Slice((*uint32)(unsafe.Pointer(cTokens)), count)
+		tokens64 = make([]int64, count)
+		for i := 0; i < count; i++ {
+			tokens64[i] = int64(src[i])
+		}
+	}
+	// The annotated JSON is not used on the Go side; it is only handed back
+	// together with the token buffer so the native side can release both.
+	C.dynamo_free_worker_selection_result(cTokens, cTokenCount, cAnnotatedJSON)
+
+	workerID := fmt.Sprintf("%d", int64(cWorkerID))
+	logger.V(logutil.DEFAULT).Info("Worker selection completed",
+		"workerID", workerID, "tokenCount", count)
+
+	return workerID, tokens64, nil
+}
+
+// buildOpenAIRequest assembles the chat-completion style payload sent to the
+// router. The prompt falls back to "default prompt" and the model to the
+// configured ffiModel when req is nil or the respective field is blank. The
+// payload always requests a single streamed token at temperature 0 and asks
+// for the "query_instance_id" annotation.
+func buildOpenAIRequest(req *schedtypes.LLMRequest) map[string]any {
+	prompt := "default prompt"
+	model := ffiModel
+	if req != nil {
+		if strings.TrimSpace(req.Prompt) != "" {
+			prompt = req.Prompt
+		}
+		if strings.TrimSpace(req.TargetModel) != "" {
+			model = req.TargetModel
+		}
+	}
+
+	return map[string]any{
+		"messages":    []map[string]any{{"role": "user", "content": prompt}},
+		"model":       model,
+		"max_tokens":  1,
+		"temperature": 0.0,
+		"stream":      true,
+		"nvext": map[string]any{
+			"annotations": []string{"query_instance_id"},
+		},
+	}
+}
+
+// --------------------------- shutdown ---------------------------
+
+// cleanupDynamo tears down the FFI state under the pipeline write lock: it
+// destroys the pipeline if one exists (a failure is only printed as a
+// warning and the handle is dropped regardless) and then shuts the runtime
+// down if it was initialized. Only a failed runtime shutdown is returned as
+// an error.
+func cleanupDynamo() error {
+	pipelineMutex.Lock()
+	defer pipelineMutex.Unlock()
+
+	if handle := pipeline; handle != nil {
+		rc := C.dynamo_destroy_worker_selection_pipeline(handle)
+		if rc != C.DYNAMO_OK {
+			fmt.Printf("Warning: dynamo_destroy_worker_selection_pipeline failed\n")
+		}
+		pipeline = nil
+	}
+
+	if !runtimeInitialized {
+		return nil
+	}
+	if rc := C.dynamo_llm_shutdown(); rc != C.DYNAMO_OK {
+		return fmt.Errorf("dynamo_llm_shutdown failed")
+	}
+	runtimeInitialized = false
+	return nil
+}
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index fb73765..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,33 +0,0 @@
-# Dockerfile has specific requirement to put this ARG at the beginning:
-# https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
-ARG BUILDER_IMAGE=golang:1.24
-ARG BASE_IMAGE=gcr.io/distroless/static:nonroot
-
-## Multistage build
-FROM ${BUILDER_IMAGE} AS builder
-ENV CGO_ENABLED=0
-ENV GOOS=linux
-ENV GOARCH=amd64
-ARG COMMIT_SHA=unknown
-ARG BUILD_REF
-
-# Dependencies
-WORKDIR /src
-COPY go.mod go.sum ./
-RUN go mod download
-
-# Sources
-COPY cmd/epp ./cmd/epp
-COPY pkg/epp ./pkg/epp
-COPY internal ./internal
-COPY api ./api
-WORKDIR /src/cmd/epp
-RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" -o /epp
-
-## Multistage deploy
-FROM ${BASE_IMAGE}
-
-WORKDIR /
-COPY --from=builder /epp /epp
-
-ENTRYPOINT ["/epp"]
diff --git a/Makefile b/Makefile
index dee7e99..d3f9ec7 100644
--- a/Makefile
+++ b/Makefile
@@ -170,6 +170,49 @@ verify-all:
##@ Build
+##@ Dynamo EPP with FFI
+
+# Build the Dynamo EPP image with CGO static library support
+# NOTE: every recipe line runs in its own shell, so the BUILDER variable set on
+# the first line is not visible to the final `buildx rm`; invoked without a
+# name, buildx removes the current builder (the one `create --use` selected).
+.PHONY: dynamo-image-local-build
+dynamo-image-local-build: ## Build the Dynamo EPP image using Docker Buildx for local development.
+	BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
+	$(MAKE) dynamo-image-build PUSH=$(PUSH)
+	$(MAKE) dynamo-image-build LOAD=$(LOAD)
+	$(DOCKER_BUILDX_CMD) rm $$BUILDER
+
+.PHONY: dynamo-image-local-push
+dynamo-image-local-push: PUSH=--push ## Build the Dynamo EPP image for local development and push it to $IMAGE_REPO.
+dynamo-image-local-push: dynamo-image-local-build
+
+.PHONY: dynamo-image-local-load
+dynamo-image-local-load: LOAD=--load ## Build the Dynamo EPP image for local development and load it in the local Docker registry.
+dynamo-image-local-load: dynamo-image-local-build
+
+.PHONY: dynamo-image-build
+dynamo-image-build: ## Build the Dynamo EPP image using Docker Buildx with CGO support.
+	$(IMAGE_BUILD_CMD) -f Dockerfile.dynamo -t $(IMAGE_TAG) \
+		--platform=$(PLATFORMS) \
+		--build-arg DOCKER_PROXY=$(DOCKER_PROXY) \
+		--build-arg BASE_IMAGE=ubuntu:24.04 \
+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
+		--build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \
+		--build-arg BUILD_REF=${BUILD_REF} \
+		$(PUSH) \
+		$(LOAD) \
+		$(IMAGE_BUILD_EXTRA_OPTS) ./
+
+.PHONY: dynamo-image-push
+dynamo-image-push: PUSH=--push ## Build the Dynamo EPP image and push it to $IMAGE_REPO.
+dynamo-image-push: dynamo-image-build
+
+.PHONY: dynamo-image-load
+dynamo-image-load: LOAD=--load ## Build the Dynamo EPP image and load it in the local Docker registry.
+dynamo-image-load: dynamo-image-build
+
+.PHONY: dynamo-image-kind
+dynamo-image-kind: dynamo-image-build ## Build the Dynamo EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
+	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
+
# Build the container image
.PHONY: image-local-build
image-local-build: ## Build the EPP image using Docker Buildx for local development.
diff --git a/cmd/epp/main.go b/cmd/epp/main.go
index b5e0617..b5c0312 100644
--- a/cmd/epp/main.go
+++ b/cmd/epp/main.go
@@ -22,6 +22,12 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
+ eppplugins "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+
+ // Dynamo plugins
+ dyncleanup "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_cleanup"
+ dynprereq "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid"
+ dynscorer "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
)
func main() {
@@ -30,6 +36,10 @@ func main() {
// For adding out-of-tree plugins to the plugins registry, use the following:
// plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function)
+ eppplugins.Register("dynamo-inject-workerid", dynprereq.InjectWorkerIDPreRequestFactory)
+ eppplugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)
+ eppplugins.Register("dynamo-cleanup", dyncleanup.DynamoCleanupPluginFactory)
+
if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
os.Exit(1)
}
diff --git a/pkg/epp/requestcontrol/body_mutator.go b/pkg/epp/requestcontrol/body_mutator.go
new file mode 100644
index 0000000..de87445
--- /dev/null
+++ b/pkg/epp/requestcontrol/body_mutator.go
@@ -0,0 +1,19 @@
+package requestcontrol
+
+import (
+ "context"
+
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// RequestBodyMutator allows pre-request plugins to mutate the outbound request body.
+// Implementations are invoked after the standard PreRequest hook completes.
+// The director only invokes the hook for plugins that implement this
+// interface, and skips it when the request body is nil.
+type RequestBodyMutator interface {
+ // MutateRequestBody receives the same arguments as PreRequest plus the
+ // decoded request body, which it may modify in place.
+ MutateRequestBody(
+ ctx context.Context,
+ request *schedtypes.LLMRequest,
+ schedulingResult *schedtypes.SchedulingResult,
+ targetPort int,
+ body map[string]any,
+ )
+}
diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go
index 670d922..0cf04cb 100644
--- a/pkg/epp/requestcontrol/director.go
+++ b/pkg/epp/requestcontrol/director.go
@@ -130,6 +130,7 @@ func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestCo
TargetModel: reqCtx.ResolvedTargetModel,
Prompt: prompt,
Headers: reqCtx.Request.Headers,
+ Annotations: map[string]any{},
}
logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality)
@@ -253,7 +254,7 @@ func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestC
reqCtx.TargetPod = targetPod
reqCtx.TargetEndpoint = endpoint
- d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort)
+ d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort, reqCtx.Request.Body)
return reqCtx, nil
}
@@ -319,13 +320,20 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed
return ""
}
-func (d *Director) runPreRequestPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult,
+func (d *Director) runPreRequestPlugins(
+ ctx context.Context,
+ request *schedulingtypes.LLMRequest,
+ schedulingResult *schedulingtypes.SchedulingResult,
targetPort int,
+ body map[string]any,
) {
for _, plugin := range d.preRequestPlugins {
log.FromContext(ctx).V(logutil.DEBUG).Info("Running pre-request plugin", "plugin", plugin.TypedName().Type)
before := time.Now()
plugin.PreRequest(ctx, request, schedulingResult, targetPort)
+ if mutator, ok := plugin.(RequestBodyMutator); ok && body != nil {
+ mutator.MutateRequestBody(ctx, request, schedulingResult, targetPort, body)
+ }
metrics.RecordRequestControlPluginProcessingLatency(PreRequestPluginType, plugin.TypedName().Type, time.Since(before))
}
}
diff --git a/pkg/epp/requestcontrol/plugins/dynamo_cleanup/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_cleanup/plugin.go
new file mode 100644
index 0000000..a389372
--- /dev/null
+++ b/pkg/epp/requestcontrol/plugins/dynamo_cleanup/plugin.go
@@ -0,0 +1,86 @@
+package dynamo_cleanup
+
+import (
+ "context"
+ "encoding/json"
+
+ log "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+ logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+
+ dynamo "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/dynamo_kv_scorer"
+)
+
+const (
+ // PluginName is the default instance name assigned by NewDynamoCleanupPlugin.
+ PluginName = "dynamo-cleanup"
+ // PluginType is the plugin type string reported via TypedName.
+ PluginType = "dynamo-cleanup"
+)
+
+// DynamoCleanupPlugin is a PostResponse plugin that cleans up router state
+// when a request completes. It calls dynamo.CallFreeRequest with the request
+// id to release the bookkeeping resources associated with the request.
+type DynamoCleanupPlugin struct {
+ typedName plugins.TypedName
+}
+
+// Compile-time checks that DynamoCleanupPlugin implements the plugin and
+// post-response interfaces.
+var _ plugins.Plugin = (*DynamoCleanupPlugin)(nil)
+var _ rc.PostResponse = (*DynamoCleanupPlugin)(nil)
+
+// NewDynamoCleanupPlugin returns a cleanup plugin carrying the default type
+// and name.
+func NewDynamoCleanupPlugin() *DynamoCleanupPlugin {
+	plugin := new(DynamoCleanupPlugin)
+	plugin.typedName = plugins.TypedName{Type: PluginType, Name: PluginName}
+	return plugin
+}
+
+// WithName replaces the plugin's instance name and returns the receiver so
+// calls can be chained.
+func (p *DynamoCleanupPlugin) WithName(name string) *DynamoCleanupPlugin {
+	p.typedName = plugins.TypedName{Type: p.typedName.Type, Name: name}
+	return p
+}
+
+// DynamoCleanupPluginFactory is the plugin-registry constructor for the
+// cleanup plugin. The raw parameters and the handle are ignored.
+func DynamoCleanupPluginFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	plugin := NewDynamoCleanupPlugin().WithName(name)
+	return plugin, nil
+}
+
+// TypedName reports the type/name pair of this plugin instance.
+func (p *DynamoCleanupPlugin) TypedName() plugins.TypedName {
+	typed := p.typedName
+	return typed
+}
+
+// PostResponse runs once a response has been received from the model server.
+// It passes the request id to dynamo.CallFreeRequest and logs the outcome.
+// Requests that are nil or carry no request id are skipped with a debug log.
+func (p *DynamoCleanupPlugin) PostResponse(
+	ctx context.Context,
+	request *schedtypes.LLMRequest,
+	response *rc.Response,
+	targetPod *backend.Pod,
+) {
+	logger := log.FromContext(ctx)
+
+	if request == nil {
+		logger.V(logutil.DEBUG).Info("DynamoCleanupPlugin: request is nil, skipping cleanup")
+		return
+	}
+
+	requestID := request.RequestId
+	if len(requestID) == 0 {
+		logger.V(logutil.DEBUG).Info("DynamoCleanupPlugin: no request ID, skipping cleanup")
+		return
+	}
+
+	// Call the dynamo router to free the request bookkeeping
+	err := dynamo.CallFreeRequest(requestID)
+	if err == nil {
+		logger.V(logutil.VERBOSE).Info("DynamoCleanupPlugin: freed request from router",
+			"requestID", requestID)
+		return
+	}
+	logger.V(logutil.DEFAULT).Error(err, "DynamoCleanupPlugin: failed to free request",
+		"requestID", requestID)
+}
+
diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
new file mode 100644
index 0000000..1c8f979
--- /dev/null
+++ b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
@@ -0,0 +1,171 @@
+package dynamo_inject_workerid
+
+import (
+ "context"
+ "encoding/json"
+ "strconv"
+ "strings"
+
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+ // typeString and pluginName are the default type and instance name of the plugin.
+ typeString = "dynamo-inject-workerid"
+ pluginName = "dynamo-inject-workerid"
+ // WorkerIDHeader is the request header carrying the selected (decode) worker id.
+ WorkerIDHeader = "x-worker-instance-id"
+ // PrefillWorkerIDHeader is the request header carrying the prefill worker id.
+ PrefillWorkerIDHeader = "x-prefiller-host-port"
+ // tokenDataAnnotationKey is the request annotation MutateRequestBody reads token ids from.
+ tokenDataAnnotationKey = "dynamo/token-data"
+)
+
+// Compile-time checks that the plugin implements the plugin, pre-request and
+// body-mutator interfaces.
+var _ plugins.Plugin = (*InjectWorkerIDPreRequest)(nil)
+var _ rc.PreRequest = (*InjectWorkerIDPreRequest)(nil)
+var _ rc.RequestBodyMutator = (*InjectWorkerIDPreRequest)(nil)
+
+// InjectWorkerIDPreRequest normalizes the worker-id headers and injects the
+// routing decision into the request body's "nvext" object.
+type InjectWorkerIDPreRequest struct {
+ typedName plugins.TypedName
+}
+
+// NewInjectWorkerIDPreRequest returns a plugin carrying the default type and
+// name.
+func NewInjectWorkerIDPreRequest() *InjectWorkerIDPreRequest {
+	plugin := new(InjectWorkerIDPreRequest)
+	plugin.typedName = plugins.TypedName{Type: typeString, Name: pluginName}
+	return plugin
+}
+
+// WithName replaces the plugin's instance name and returns the receiver so
+// calls can be chained.
+func (p *InjectWorkerIDPreRequest) WithName(name string) *InjectWorkerIDPreRequest {
+	p.typedName = plugins.TypedName{Type: p.typedName.Type, Name: name}
+	return p
+}
+
+// InjectWorkerIDPreRequestFactory is the plugin-registry constructor for the
+// worker-id injection plugin. The raw parameters and the handle are ignored.
+func InjectWorkerIDPreRequestFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	plugin := NewInjectWorkerIDPreRequest().WithName(name)
+	return plugin, nil
+}
+
+// TypedName reports the type/name pair of this plugin instance.
+func (p *InjectWorkerIDPreRequest) TypedName() plugins.TypedName {
+	return p.typedName
+}
+
+// PreRequest trims surrounding whitespace from the worker-id and
+// prefill-worker-id headers. A header whose trimmed value is empty is left
+// as it was. A nil request is ignored; a request without headers receives an
+// empty header map.
+func (p *InjectWorkerIDPreRequest) PreRequest(
+	_ context.Context,
+	req *schedtypes.LLMRequest,
+	_ *schedtypes.SchedulingResult,
+	_ int,
+) {
+	if req == nil {
+		return
+	}
+	if req.Headers == nil {
+		req.Headers = map[string]string{}
+	}
+
+	// Worker instance id first, then the prefill worker id.
+	for _, header := range [...]string{WorkerIDHeader, PrefillWorkerIDHeader} {
+		if trimmed := strings.TrimSpace(req.Headers[header]); trimmed != "" {
+			req.Headers[header] = trimmed
+		}
+	}
+}
+
+// MutateRequestBody copies the routing decision from the request headers into
+// the body's "nvext" object (created when absent or not an object).
+//
+// Nothing happens when the request, its headers or the body is nil, or when
+// the worker-id header is blank. Otherwise the worker ids are written, token
+// ids found under the token-data annotation are attached as
+// "token_data", and the "query_instance_id" entry is removed from
+// nvext["annotations"].
+func (p *InjectWorkerIDPreRequest) MutateRequestBody(
+	_ context.Context,
+	req *schedtypes.LLMRequest,
+	_ *schedtypes.SchedulingResult,
+	_ int,
+	body map[string]any,
+) {
+	if req == nil || body == nil || req.Headers == nil {
+		return
+	}
+
+	wid := strings.TrimSpace(req.Headers[WorkerIDHeader])
+	if wid == "" {
+		return
+	}
+	prefillWid := strings.TrimSpace(req.Headers[PrefillWorkerIDHeader])
+
+	nvext, _ := body["nvext"].(map[string]any)
+	if nvext == nil {
+		nvext = map[string]any{}
+		body["nvext"] = nvext
+	}
+
+	setWorkerIDs(nvext, wid, prefillWid)
+	if tokens := tokenDataFromAnnotation(req.Annotations[tokenDataAnnotationKey]); len(tokens) > 0 {
+		nvext["token_data"] = tokens
+	}
+	dropQueryInstanceAnnotation(nvext)
+}
+
+// setWorkerIDs writes the worker ids into nvext. Ids that do not parse as
+// unsigned 64-bit decimals are skipped silently.
+func setWorkerIDs(nvext map[string]any, decodeID, prefillID string) {
+	if prefillID != "" && prefillID != decodeID {
+		// Disaggregated mode: use prefill_worker_id and decode_worker_id
+		if id, err := strconv.ParseUint(prefillID, 10, 64); err == nil {
+			nvext["prefill_worker_id"] = id
+		}
+		if id, err := strconv.ParseUint(decodeID, 10, 64); err == nil {
+			nvext["decode_worker_id"] = id
+		}
+		return
+	}
+	// Aggregated mode (empty prefill or prefill == decode): use backend_instance_id
+	if id, err := strconv.ParseUint(decodeID, 10, 64); err == nil {
+		nvext["backend_instance_id"] = id
+	}
+}
+
+// tokenDataFromAnnotation converts the supported annotation encodings
+// ([]int64, []any holding int64/float64 values, or raw JSON) into a token-id
+// slice. It returns nil for any other value, including a missing annotation.
+func tokenDataFromAnnotation(value any) []int64 {
+	switch v := value.(type) {
+	case []int64:
+		return v
+	case []any:
+		var out []int64
+		for _, elem := range v {
+			switch t := elem.(type) {
+			case int64:
+				out = append(out, t)
+			case float64:
+				// Decoded JSON numbers arrive as float64.
+				out = append(out, int64(t))
+			}
+		}
+		return out
+	case json.RawMessage:
+		var out []int64
+		if err := json.Unmarshal(v, &out); err == nil {
+			return out
+		}
+	}
+	return nil
+}
+
+// dropQueryInstanceAnnotation removes "query_instance_id" from
+// nvext["annotations"] when that value is a []string or []any, deleting the
+// key entirely if no entries remain.
+func dropQueryInstanceAnnotation(nvext map[string]any) {
+	const queryInstanceID = "query_instance_id"
+
+	switch annList := nvext["annotations"].(type) {
+	case []string:
+		filtered := make([]string, 0, len(annList))
+		for _, ann := range annList {
+			if ann != queryInstanceID {
+				filtered = append(filtered, ann)
+			}
+		}
+		if len(filtered) == 0 {
+			delete(nvext, "annotations")
+		} else {
+			nvext["annotations"] = filtered
+		}
+	case []any:
+		filtered := make([]any, 0, len(annList))
+		for _, ann := range annList {
+			if str, ok := ann.(string); !ok || str != queryInstanceID {
+				filtered = append(filtered, ann)
+			}
+		}
+		if len(filtered) == 0 {
+			delete(nvext, "annotations")
+		} else {
+			nvext["annotations"] = filtered
+		}
+	}
+}
diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
new file mode 100644
index 0000000..e94b72b
--- /dev/null
+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
@@ -0,0 +1,24 @@
+# This is an example for configuring the EPP to use the dynamo token-aware kv router for scoring the pods
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+  # Required: tells EPP which profile to use (even if you only have one)
+  - type: single-profile-handler
+
+  # Picker: chooses the final endpoint after scoring
+  - name: picker
+    type: max-score-picker
+  - name: dyn-pre  # pre-request plugin that injects the worker ids into the request
+    type: dynamo-inject-workerid
+    parameters: {}
+  - name: dyn-kv  # scorer that queries the Dynamo router for a worker
+    type: kv-aware-scorer
+  # Cleanup: frees router bookkeeping when request completes
+  - name: dyn-cleanup
+    type: dynamo-cleanup
+schedulingProfiles:
+  - name: default
+    plugins:
+      - pluginRef: dyn-kv
+        weight: 1
+      - pluginRef: picker
diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
new file mode 100644
index 0000000..31af16e
--- /dev/null
+++ b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go
@@ -0,0 +1,587 @@
+package dynamo_kv_scorer
+
+/*
+#cgo CPPFLAGS: -I${SRCDIR}/include
+#cgo CXXFLAGS: -std=c++17
+#cgo LDFLAGS: ${SRCDIR}/lib/libdynamo_llm_capi.a -lstdc++ -ldl -lpthread -lm
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h> // for free
+#include <stdbool.h>
+
+// enum underlying type is uint32_t; matches cbindgen output
+typedef uint32_t dynamo_llm_result_t;
+enum { DYNAMO_OK = 0, DYNAMO_ERR = 1 };
+
+// opaque handle forward-decl
+struct WorkerSelectionPipeline;
+typedef struct WorkerSelectionPipeline WorkerSelectionPipeline;
+
+// Prototypes (C-compatible)
+dynamo_llm_result_t dynamo_llm_init(const char *namespace_c_str,
+ const char *component_c_str,
+ int64_t worker_id,
+ uint32_t kv_block_size);
+
+dynamo_llm_result_t dynamo_llm_shutdown(void);
+dynamo_llm_result_t dynamo_llm_load_publisher_create(void);
+
+dynamo_llm_result_t dynamo_kv_event_publish_stored(uint64_t event_id,
+ const uint32_t *token_ids,
+ const uintptr_t *num_block_tokens,
+ const uint64_t *block_ids,
+ size_t num_blocks,
+ const uint64_t *parent_hash,
+ uint64_t lora_id);
+
+dynamo_llm_result_t dynamo_kv_event_publish_removed(uint64_t event_id,
+ const uint64_t *block_ids,
+ size_t num_blocks);
+
+dynamo_llm_result_t dynamo_create_worker_selection_pipeline(const char *namespace_c_str,
+ const char *component_c_str,
+ const char *model_name_c_str,
+ bool use_kv_routing,
+ double busy_threshold,
+ double overlap_score_weight,
+ double router_temperature,
+ bool use_kv_events,
+ bool router_replica_sync,
+ bool enforce_disagg,
+ WorkerSelectionPipeline **pipeline_out);
+
+dynamo_llm_result_t dynamo_destroy_worker_selection_pipeline(WorkerSelectionPipeline *pipeline);
+
+dynamo_llm_result_t dynamo_query_worker_selection_and_annotate(WorkerSelectionPipeline *pipeline,
+ const char *request_json_c_str,
+ int64_t *decode_worker_id_out,
+ int64_t *prefill_worker_id_out,
+ uint32_t **token_ids_out,
+ size_t *token_count_out,
+ char **annotated_request_json_out);
+
+dynamo_llm_result_t dynamo_free_worker_selection_result(uint32_t *token_ids,
+ size_t token_count,
+ char *annotated_request_json);
+
+// Router bookkeeping functions for GAIE integration
+dynamo_llm_result_t dynamo_router_add_request(WorkerSelectionPipeline *pipeline,
+ const char *request_id_c_str,
+ const uint32_t *token_ids,
+ size_t token_count,
+ uint64_t worker_id,
+ uint32_t dp_rank);
+
+dynamo_llm_result_t dynamo_router_mark_prefill_complete(WorkerSelectionPipeline *pipeline,
+ const char *request_id_c_str);
+
+dynamo_llm_result_t dynamo_router_free_request(WorkerSelectionPipeline *pipeline,
+ const char *request_id_c_str);
+*/
+import "C"
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "os"
+ "strings"
+ "sync"
+ "unsafe"
+
+ log "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+ schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+ logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+ PluginName = "dynamo-kv-scorer" // default instance name assigned by NewKVAwareScorer
+ KVAwareScorerType = "kv-aware-scorer" // plugin type string stored in the scorer's TypedName
+ StateKeyWorkerInstanceID = schedtypes.StateKey("dynamo/worker-instance-id") // cycle-state key under which Score stores the selected worker ID
+ StateKeyPrefillWorkerID = schedtypes.StateKey("dynamo/prefill-worker-id") // cycle-state key for the prefill worker ID; written only when the router returned one
+ StateKeyRequestID = schedtypes.StateKey("dynamo/request-id") // cycle-state key under which Score stores req.RequestId
+ WorkerIDHeader = "x-worker-instance-id" // request header Score sets to the selected worker ID
+ PrefillWorkerIDHeader = "x-prefiller-host-port" // request header Score sets to the prefill worker ID (a numeric ID string, despite the header name)
+ tokenDataAnnotationKey = "dynamo/token-data" // req.Annotations key under which Score stores a copy of the router's token IDs
+)
+
+// --------------------------- config / env ---------------------------
+
+var warmupOnce sync.Once // guards the one-time initFFI call made from KVAwareScorerFactory
+var warmupErr error // outcome of that one-time call (or of a recovered panic); checked by every factory invocation
+
+type stateString string // string wrapper that Score writes into CycleState
+type params struct { // plugin parameters decoded by KVAwareScorerFactory; currently has no fields
+}
+
+func (s stateString) Clone() schedtypes.StateData { return s } // returns the receiver itself; no copy is made
+
+type KVAwareScorer struct { // scorer plugin backed by the Dynamo router through cgo; it holds only its type/name identity
+ typedName plugins.TypedName
+}
+
+var _ plugins.Plugin = (*KVAwareScorer)(nil) // compile-time check that *KVAwareScorer satisfies plugins.Plugin
+var _ framework.Scorer = (*KVAwareScorer)(nil) // compile-time check that *KVAwareScorer satisfies framework.Scorer
+
+func NewKVAwareScorer() *KVAwareScorer { // builds a scorer with type KVAwareScorerType and the default name PluginName; it does not touch the FFI runtime
+ return &KVAwareScorer{
+ typedName: plugins.TypedName{Type: KVAwareScorerType, Name: PluginName},
+ }
+}
+
+func (k *KVAwareScorer) WithName(name string) *KVAwareScorer { k.typedName.Name = name; return k } // overwrites the instance name in place and returns the same pointer so calls can be chained
+
+func KVAwareScorerFactory(name string, raw json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { // plugin factory: builds a scorer called name and runs the process-wide FFI warm-up on first use; returns an error if that warm-up failed
+ p := params{}
+ _ = json.Unmarshal(raw, &p) // the decode error is discarded; params has no fields, so raw currently has no effect
+
+ s := NewKVAwareScorer().WithName(name)
+
+ // one-time FFI init (runtime + persistent pipeline)
+ warmupOnce.Do(func() {
+ defer func() {
+ if r := recover(); r != nil { // loadDynamoConfig panics on invalid configuration; convert that into an error
+ warmupErr = fmt.Errorf("Dynamo configuration error: %v", r)
+ }
+ }()
+ warmupErr = initFFI()
+ })
+ if warmupErr != nil { // warmupErr is never reset, so every later factory call fails with the same error
+ return nil, fmt.Errorf("Dynamo FFI init for the Router failed: %w", warmupErr)
+ }
+
+ return s, nil
+}
+
+func (k *KVAwareScorer) TypedName() plugins.TypedName { return k.typedName } // returns the plugin's type/name pair
+
+// --------------------------- FFI integration ---------------------------
+
+var (
+ ffiOnce sync.Once // guards the body of initFFI so it runs at most once per process
+ ffiErr error // error recorded by that single run; returned by every initFFI call
+
+ ffiNamespace string // from DYNAMO_NAMESPACE
+ ffiComponent string // from DYNAMO_COMPONENT
+ ffiModel string // from DYNAMO_MODEL; also the fallback model name in buildOpenAIRequest
+ ffiOverlapScoreWeight float64 // from DYNAMO_OVERLAP_SCORE_WEIGHT
+ ffiRouterTemperature float64 // from DYNAMO_ROUTER_TEMPERATURE
+ ffiKvBlockSize uint32 // from DYNAMO_KV_BLOCK_SIZE (required)
+ ffiWorkerID int64 // from DYNAMO_WORKER_ID
+ ffiEnforceDisagg bool // from DYNAMO_ENFORCE_DISAGG
+
+ runtimeInitialized bool // set once dynamo_llm_init succeeds; cleared by cleanupDynamo
+
+ // Boxed pipeline handle (owned on the Rust side, opaque here)
+ pipeline *C.struct_WorkerSelectionPipeline
+ pipelineMutex sync.RWMutex // guards reads and writes of pipeline
+)
+
+func loadDynamoConfig() { // fills the ffi* package variables from DYNAMO_* environment variables; panics when DYNAMO_KV_BLOCK_SIZE is missing or invalid
+ ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
+ ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
+ ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
+ ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
+ ffiEnforceDisagg = getEnvBoolOrDefault("DYNAMO_ENFORCE_DISAGG", false)
+
+ ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
+ ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)
+
+ kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE") // no default: the block size has to be supplied explicitly
+ if kvBlockSizeStr == "" {
+ panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
+ }
+ var tmp int64
+ if n, err := fmt.Sscanf(kvBlockSizeStr, "%d", &tmp); err != nil || n != 1 {
+ panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
+ }
+ if tmp < 16 || tmp > 8192 { // range-check the int64 BEFORE narrowing: negative or >2^32 inputs would otherwise wrap into the valid range
+ panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", tmp))
+ }
+ ffiKvBlockSize = uint32(tmp) // lossless now that tmp is known to be within [16,8192]
+ if (ffiKvBlockSize & (ffiKvBlockSize - 1)) != 0 { // a power of two has exactly one bit set
+ panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", ffiKvBlockSize))
+ }
+ fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
+}
+
+func getEnvOrDefault(key, def string) string { // returns the value of environment variable key, or def when it is unset or empty
+ if v := os.Getenv(key); v != "" {
+ return v
+ }
+ return def
+}
+func getEnvInt64OrDefault(key string, def int64) int64 { // parses environment variable key as an int64; returns def when it is unset, empty, or cannot be scanned
+ if v := os.Getenv(key); v != "" {
+ var p int64
+ if n, err := fmt.Sscanf(v, "%d", &p); err == nil && n == 1 { // NOTE(review): Sscanf appears to accept a numeric prefix such as "12abc" — confirm that is acceptable
+ return p
+ }
+ }
+ return def // a scan failure falls back silently; nothing is logged
+}
+func getEnvFloatOrDefault(key string, def float64) float64 { // parses environment variable key as a float64; returns def when it is unset, empty, or cannot be scanned
+ if v := os.Getenv(key); v != "" {
+ var p float64
+ if n, err := fmt.Sscanf(v, "%f", &p); err == nil && n == 1 { // NOTE(review): trailing characters after the number appear to be ignored by Sscanf — confirm
+ return p
+ }
+ }
+ return def // a scan failure falls back silently; nothing is logged
+}
+func getEnvBoolOrDefault(key string, def bool) bool { // interprets environment variable key as a boolean, case-insensitively; returns def when it is unset, empty, or not a recognised spelling
+ if v := os.Getenv(key); v != "" {
+ switch strings.ToLower(v) { // the value is not trimmed, so surrounding whitespace makes it unrecognised
+ case "true", "1", "yes", "on":
+ return true
+ case "false", "0", "no", "off":
+ return false
+ }
+ }
+ return def
+}
+
+// initFFI initializes the Dynamo runtime and creates the persistent boxed pipeline at most once per process; every call returns the error recorded by that single run.
+func initFFI() error {
+ ffiOnce.Do(func() {
+ loadDynamoConfig() // panics on invalid configuration
+
+ ns := C.CString(ffiNamespace)
+ cm := C.CString(ffiComponent)
+ model := C.CString(ffiModel)
+ defer C.free(unsafe.Pointer(ns)) // the C copies are released when this closure returns
+ defer C.free(unsafe.Pointer(cm))
+ defer C.free(unsafe.Pointer(model))
+
+ // Init Dynamo runtime
+ if rc := C.dynamo_llm_init(ns, cm, C.int64_t(ffiWorkerID), C.uint32_t(ffiKvBlockSize)); rc != C.DYNAMO_OK {
+ ffiErr = fmt.Errorf("dynamo_llm_init failed")
+ return
+ }
+ runtimeInitialized = true // stays true even if pipeline creation below fails
+
+ // Create persistent pipeline
+ pipelineMutex.Lock()
+ defer pipelineMutex.Unlock()
+
+ rc := C.dynamo_create_worker_selection_pipeline(
+ ns,
+ cm,
+ model,
+ C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_ROUTING", true)), // this and the three other getEnv* arguments below are read here rather than in loadDynamoConfig
+ C.double(getEnvFloatOrDefault("DYNAMO_BUSY_THRESHOLD", -1.0)),
+ C.double(ffiOverlapScoreWeight),
+ C.double(ffiRouterTemperature),
+ C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_EVENTS", true)),
+ C.bool(getEnvBoolOrDefault("DYNAMO_ROUTER_REPLICA_SYNC", true)),
+ C.bool(ffiEnforceDisagg),
+ &pipeline, // out-parameter: receives the opaque pipeline handle
+ )
+ if rc != C.DYNAMO_OK {
+ ffiErr = fmt.Errorf("dynamo_create_worker_selection_pipeline failed")
+ return
+ }
+ })
+ return ffiErr
+}
+
+// --------------------------- scoring ---------------------------
+
+func (k *KVAwareScorer) Score( // asks the Dynamo router for a worker, records the choice on the request and cycle state, and returns the same score (1.0) for every pod
+ ctx context.Context,
+ cycle *schedtypes.CycleState,
+ req *schedtypes.LLMRequest,
+ pods []schedtypes.Pod,
+) map[schedtypes.Pod]float64 {
+ logger := log.FromContext(ctx)
+
+ workerID, prefillWorkerID, tokenData, err := k.callDynamoRouter(ctx, req)
+ if err != nil { // best effort: a router failure is logged and scoring continues without routing headers
+ logger.V(logutil.DEFAULT).Error(err, "Dynamo call failed; proceeding without worker id")
+ } else if workerID != "" {
+ logger.V(logutil.DEFAULT).Info(
+ "Dynamo router selected worker",
+ "workerID", workerID,
+ "prefillWorkerID", prefillWorkerID,
+ "tokenDataCount", len(tokenData),
+ "tokenData", tokenData, // NOTE(review): logs the full token list at DEFAULT verbosity for every request — confirm this is intended
+ )
+ cycle.Write(StateKeyWorkerInstanceID, stateString(workerID))
+ if req.Headers == nil {
+ req.Headers = map[string]string{}
+ }
+ req.Headers[WorkerIDHeader] = workerID // mutates the caller's request
+
+ // Set prefill worker ID if present
+ if prefillWorkerID != "" {
+ cycle.Write(StateKeyPrefillWorkerID, stateString(prefillWorkerID))
+ req.Headers[PrefillWorkerIDHeader] = prefillWorkerID
+ }
+
+ if len(tokenData) > 0 {
+ if req.Annotations == nil {
+ req.Annotations = map[string]any{}
+ }
+ copied := make([]int64, len(tokenData)) // store a private copy so the annotation does not alias tokenData
+ copy(copied, tokenData)
+ req.Annotations[tokenDataAnnotationKey] = copied
+ }
+
+ // GAIE Stage 1: Register request with router bookkeeping
+ // The request ID comes from Envoy's request ID header
+ requestID := req.RequestId
+ if requestID != "" {
+ cycle.Write(StateKeyRequestID, stateString(requestID))
+ if addErr := k.callAddRequest(ctx, requestID, tokenData, workerID, prefillWorkerID); addErr != nil { // bookkeeping failure is logged only; it does not change the scores
+ logger.V(logutil.DEFAULT).Error(addErr, "Failed to add request to router bookkeeping",
+ "requestID", requestID)
+ }
+ } else {
+ logger.V(logutil.VERBOSE).Info("No request ID available, skipping router bookkeeping")
+ }
+ }
+
+ out := make(map[schedtypes.Pod]float64, len(pods))
+ for _, p := range pods { // uniform score: the router's choice is conveyed through headers and cycle state, not through per-pod scores
+ out[p] = 1.0
+ }
+ return out
+}
+
+// --------------------------- router call (persistent only) ---------------------------
+
+func (k *KVAwareScorer) callDynamoRouter( // queries the persistent pipeline for a worker selection; returns the decode worker ID, the prefill worker ID ("" when negative), the token IDs, and an error
+ ctx context.Context,
+ req *schedtypes.LLMRequest,
+) (workerID string, prefillWorkerID string, tokenData []int64, err error) {
+ logger := log.FromContext(ctx)
+
+ if err := initFFI(); err != nil { // a no-op after the first call; returns the recorded init error
+ logger.V(logutil.DEFAULT).Error(err, "FFI init failed")
+ return "", "", nil, err
+ }
+ if !runtimeInitialized {
+ return "", "", nil, fmt.Errorf("dynamo runtime not initialized")
+ }
+
+ pipelineMutex.RLock()
+ currentPipeline := pipeline
+ pipelineMutex.RUnlock() // NOTE(review): the handle is used below after the lock is released; a concurrent cleanupDynamo could destroy it — confirm this cannot overlap
+
+ if currentPipeline == nil {
+ return "", "", nil, fmt.Errorf("dynamo worker selection pipeline not created")
+ }
+
+ // Build OpenAI-compatible JSON request
+ requestBody := buildOpenAIRequest(req)
+ requestJSON, jsonErr := json.Marshal(requestBody)
+ if jsonErr != nil {
+ logger.V(logutil.DEFAULT).Error(jsonErr, "Failed to marshal OpenAI request")
+ return "", "", nil, fmt.Errorf("marshal OpenAI request: %w", jsonErr)
+ }
+ cRequestJSON := C.CString(string(requestJSON))
+ defer C.free(unsafe.Pointer(cRequestJSON))
+
+ // Output variables
+ var cDecodeWorkerID C.int64_t
+ var cPrefillWorkerID C.int64_t
+ var cTokens *C.uint32_t
+ var cTokenCount C.size_t
+ var cAnnotatedJSON *C.char
+
+ // Call the worker selection pipeline
+ rc := C.dynamo_query_worker_selection_and_annotate(
+ currentPipeline,
+ cRequestJSON,
+ &cDecodeWorkerID,
+ &cPrefillWorkerID,
+ &cTokens,
+ &cTokenCount,
+ &cAnnotatedJSON,
+ )
+ if rc != C.DYNAMO_OK { // NOTE(review): the out-pointers are not freed on this path; presumably nothing is allocated on failure — confirm against the C API
+ return "", "", nil, fmt.Errorf("dynamo_query_worker_selection_and_annotate failed")
+ }
+
+ // Copy tokens into Go memory and free C memory
+ count := int(uintptr(cTokenCount))
+ var tokens64 []int64
+ if count > 0 && cTokens != nil {
+ src := unsafe.Slice((*uint32)(unsafe.Pointer(cTokens)), count) // view over the C buffer; only valid until the free call below
+ tokens64 = make([]int64, count)
+ for i := 0; i < count; i++ {
+ tokens64[i] = int64(src[i])
+ }
+ }
+ C.dynamo_free_worker_selection_result(cTokens, cTokenCount, cAnnotatedJSON) // the annotated JSON is released here without being read
+
+ workerIDStr := fmt.Sprintf("%d", int64(cDecodeWorkerID))
+ prefillWorkerIDStr := ""
+ // Rust returns -1 for prefill_worker_id when not in disaggregated mode
+ if int64(cPrefillWorkerID) >= 0 {
+ prefillWorkerIDStr = fmt.Sprintf("%d", int64(cPrefillWorkerID))
+ }
+ logger.V(logutil.DEFAULT).Info("Worker selection completed",
+ "workerID", workerIDStr, "prefillWorkerID", prefillWorkerIDStr, "tokenCount", count)
+
+ return workerIDStr, prefillWorkerIDStr, tokens64, nil
+}
+
+func buildOpenAIRequest(req *schedtypes.LLMRequest) map[string]any { // builds the chat-completion style body sent to the router; tolerates a nil req
+ requestBody := make(map[string]any)
+ userText := "default prompt" // placeholder used when req is nil or its prompt is blank
+ if req != nil && strings.TrimSpace(req.Prompt) != "" {
+ userText = req.Prompt // the untrimmed prompt is forwarded
+ }
+ requestBody["messages"] = []map[string]any{{"role": "user", "content": userText}}
+ if req != nil && strings.TrimSpace(req.TargetModel) != "" {
+ requestBody["model"] = req.TargetModel
+ } else {
+ requestBody["model"] = ffiModel // fall back to the model configured through DYNAMO_MODEL
+ }
+ requestBody["max_tokens"] = 1
+ requestBody["temperature"] = 0.0
+ requestBody["stream"] = true
+ requestBody["nvext"] = map[string]any{
+ "annotations": []string{"query_instance_id"},
+ }
+ return requestBody
+}
+
+// --------------------------- router bookkeeping ---------------------------
+
+// callAddRequest registers requestID, its token IDs and the selected worker with the router's bookkeeping.
+// Score calls it after worker selection; it returns an error when the runtime or pipeline is missing, workerID cannot be parsed, or the FFI call fails.
+func (k *KVAwareScorer) callAddRequest(
+ ctx context.Context,
+ requestID string,
+ tokenData []int64,
+ workerID string,
+ prefillWorkerID string, // NOTE(review): not used in this function body
+) error {
+ logger := log.FromContext(ctx)
+
+ if !runtimeInitialized {
+ return fmt.Errorf("dynamo runtime not initialized")
+ }
+
+ pipelineMutex.RLock()
+ currentPipeline := pipeline
+ pipelineMutex.RUnlock() // NOTE(review): the handle is used below after the lock is released — confirm cleanupDynamo cannot run concurrently
+
+ if currentPipeline == nil {
+ return fmt.Errorf("dynamo worker selection pipeline not created")
+ }
+
+ // Parse worker ID (use decode worker for bookkeeping in disagg mode)
+ var workerIDUint uint64
+ if _, err := fmt.Sscanf(workerID, "%d", &workerIDUint); err != nil { // the underlying scan error is not included in the returned error
+ return fmt.Errorf("invalid worker ID: %s", workerID)
+ }
+
+ // Convert token data from int64 to uint32
+ tokens := make([]uint32, len(tokenData))
+ for i, t := range tokenData {
+ tokens[i] = uint32(t) // unchecked narrowing; callDynamoRouter produces these values from uint32, so they are presumably always in range
+ }
+
+ cRequestID := C.CString(requestID)
+ defer C.free(unsafe.Pointer(cRequestID))
+
+ var cTokens *C.uint32_t // stays nil when there are no tokens
+ if len(tokens) > 0 {
+ cTokens = (*C.uint32_t)(unsafe.Pointer(&tokens[0])) // points into Go memory; presumably the callee does not retain it after returning — confirm
+ }
+
+ rc := C.dynamo_router_add_request(
+ currentPipeline,
+ cRequestID,
+ cTokens,
+ C.size_t(len(tokens)),
+ C.uint64_t(workerIDUint),
+ C.uint32_t(0), // dp_rank = 0 for now
+ )
+
+ if rc != C.DYNAMO_OK {
+ return fmt.Errorf("dynamo_router_add_request failed")
+ }
+
+ logger.V(logutil.VERBOSE).Info("Added request to router bookkeeping",
+ "requestID", requestID, "workerID", workerID, "tokenCount", len(tokens))
+ return nil
+}
+
+// CallMarkPrefillComplete tells the router that prefill has completed for requestID.
+// Exported for use by response handlers; errors if the runtime or pipeline is unavailable or the FFI call fails.
+func CallMarkPrefillComplete(requestID string) error {
+ // Hold the read lock across the FFI call: cleanupDynamo destroys the pipeline and clears
+ // runtimeInitialized under the write lock, so the handle must not be used after RUnlock.
+ pipelineMutex.RLock()
+ defer pipelineMutex.RUnlock()
+
+ if !runtimeInitialized {
+ return fmt.Errorf("dynamo runtime not initialized")
+ }
+ if pipeline == nil {
+ return fmt.Errorf("dynamo worker selection pipeline not created")
+ }
+
+ cRequestID := C.CString(requestID)
+ defer C.free(unsafe.Pointer(cRequestID)) // release the C copy of the ID on every return path
+
+ rc := C.dynamo_router_mark_prefill_complete(pipeline, cRequestID)
+ if rc != C.DYNAMO_OK {
+ return fmt.Errorf("dynamo_router_mark_prefill_complete failed")
+ }
+ return nil
+}
+
+// CallFreeRequest releases the router's bookkeeping state for a completed or cancelled request.
+// Exported for use by response handlers; errors if the runtime or pipeline is unavailable or the FFI call fails.
+func CallFreeRequest(requestID string) error {
+ // Hold the read lock across the FFI call: cleanupDynamo destroys the pipeline and clears
+ // runtimeInitialized under the write lock, so the handle must not be used after RUnlock.
+ pipelineMutex.RLock()
+ defer pipelineMutex.RUnlock()
+
+ if !runtimeInitialized {
+ return fmt.Errorf("dynamo runtime not initialized")
+ }
+ if pipeline == nil {
+ return fmt.Errorf("dynamo worker selection pipeline not created")
+ }
+
+ cRequestID := C.CString(requestID)
+ defer C.free(unsafe.Pointer(cRequestID)) // release the C copy of the ID on every return path
+
+ rc := C.dynamo_router_free_request(pipeline, cRequestID)
+ if rc != C.DYNAMO_OK {
+ return fmt.Errorf("dynamo_router_free_request failed")
+ }
+ return nil
+}
+
+// --------------------------- shutdown ---------------------------
+
+func cleanupDynamo() error { // destroys the pipeline and shuts the runtime down under the write lock; returns an error only when the runtime shutdown fails
+ pipelineMutex.Lock()
+ defer pipelineMutex.Unlock()
+
+ if pipeline != nil {
+ if rc := C.dynamo_destroy_worker_selection_pipeline(pipeline); rc != C.DYNAMO_OK { // a failed destroy is only printed; the handle is dropped regardless
+ fmt.Printf("Warning: dynamo_destroy_worker_selection_pipeline failed\n")
+ }
+ pipeline = nil
+ }
+
+ if runtimeInitialized {
+ if rc := C.dynamo_llm_shutdown(); rc != C.DYNAMO_OK {
+ return fmt.Errorf("dynamo_llm_shutdown failed") // runtimeInitialized stays true on this path
+ }
+ runtimeInitialized = false // ffiOnce is not reset, so initFFI will not re-create the runtime afterwards
+ }
+ return nil
+}
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
index 2962117..7da1d43 100644
--- a/pkg/epp/scheduling/types/types.go
+++ b/pkg/epp/scheduling/types/types.go
@@ -33,10 +33,12 @@ type LLMRequest struct {
Prompt string
// Headers is a map of the request headers.
Headers map[string]string
+ // Annotations provides plugin-specific data that should travel alongside the request.
+ Annotations map[string]any
}
func (r *LLMRequest) String() string {
- return fmt.Sprintf("RequestID: %s, TargetModel: %s, PromptLength: %d, Headers: %v", r.RequestId, r.TargetModel, len(r.Prompt), r.Headers)
+ return fmt.Sprintf("RequestID: %s, TargetModel: %s, PromptLength: %d, Headers: %v, Annotations: %v", r.RequestId, r.TargetModel, len(r.Prompt), r.Headers, r.Annotations)
}
type Pod interface {
# SPDX-FileCopyrightText: Copyright The Kubernetes Authors.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
# Dynamo EPP Dockerfile
# Builds a custom EPP image with Dynamo KV-aware routing plugins
#
# PREREQUISITES: Run `make dynamo-lib` before building this image to ensure
# the Dynamo FFI library and headers are in place.
# Both images can be overridden with --build-arg (the Makefile does so to add a
# registry-proxy prefix).
ARG BUILDER_IMAGE=golang:1.24-bookworm
ARG BASE_IMAGE=ubuntu:24.04

# =============================================================================
# Build stage
# =============================================================================
FROM ${BUILDER_IMAGE} AS builder

# Docker buildx provides these automatically for multi-platform builds
ARG TARGETOS=linux
ARG TARGETARCH

WORKDIR /workspace

# Install build dependencies for CGO. --no-install-recommends keeps optional
# packages out of the layer; the apt lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        libc-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy go mod files first for better caching
COPY go.mod go.sum ./
RUN go mod download

# Copy the source code (including pre-built Dynamo library)
COPY . .

# Verify Dynamo library exists: it is produced on the host by `make dynamo-lib`
# and is linked statically by cgo, so fail early with a clear message.
RUN if [ ! -f "pkg/plugins/dynamo_kv_scorer/lib/libdynamo_llm_capi.a" ]; then \
        echo "ERROR: Dynamo library not found!"; \
        echo "Run 'make dynamo-lib' before building the Docker image."; \
        exit 1; \
    fi

# Version metadata changes on every commit. Build args are visible to every RUN
# that follows their declaration, so they are declared only here, right before
# the step that uses them, to keep the dependency layers above cacheable.
ARG COMMIT_SHA
ARG BUILD_REF

# Build with CGO enabled for the Dynamo FFI
# Use TARGETOS/TARGETARCH from Docker buildx for proper platform support
RUN CGO_ENABLED=1 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \
    -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.GitVersion=${BUILD_REF} \
    -X sigs.k8s.io/gateway-api-inference-extension/version.GitCommit=${COMMIT_SHA}" \
    -o epp ./cmd/epp

# =============================================================================
# Runtime stage
# =============================================================================
FROM ${BASE_IMAGE}

# Install runtime dependencies: CA certificates for TLS and the C++ runtime
# that the cgo-linked binary needs.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user before copying the binary so this layer stays cached
# when only the binary changes.
RUN useradd -r -u 65532 -g nogroup nonroot

WORKDIR /

# Copy the binary from builder
COPY --from=builder /workspace/epp .

# Note: EPP config is mounted via Kubernetes ConfigMap at runtime
# See helm/dynamo-gaie/templates/epp-configmap.yaml

# Numeric UID:GID so a runtime can verify the container is non-root
# (65534 is presumably the GID of `nogroup` in the base image).
USER 65532:65534

ENTRYPOINT ["/epp"]
# Dynamo EPP Makefile
# Builds custom EPP image with Dynamo KV-aware routing plugins
# Image configuration
# The image reference is $(DOCKER_SERVER)/$(IMAGE_NAME):$(GIT_TAG).
# The image-build target passes neither --push nor --load unless PUSH or LOAD is set;
# the image-push / image-load targets set them.
# GIT_COMMIT_SHA and GIT_TAG fall back to "unknown" / "dev" when git fails.
DOCKER_SERVER ?= dynamo
IMAGE_NAME := dynamo-epp
GIT_COMMIT_SHA ?= $(shell git rev-parse HEAD 2>/dev/null || echo "unknown")
GIT_TAG ?= $(shell git describe --tags --dirty --always 2>/dev/null || echo "dev")
IMAGE_REPO ?= $(DOCKER_SERVER)/$(IMAGE_NAME)
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
# Build configuration
# The Dynamo library is compiled for the host architecture, so the default
# Docker platform is derived from `uname -m` to match it. PLATFORMS can still
# be overridden on the command line or from the environment.
HOST_ARCH := $(shell uname -m)
ifneq ($(filter aarch64 arm64,$(HOST_ARCH)),)
PLATFORMS ?= linux/arm64
else
# x86_64 and every unrecognised architecture
PLATFORMS ?= linux/amd64
endif
# Docker proxy for avoiding rate limits (e.g., ECR mirror)
# Set DOCKER_PROXY to prefix base images, e.g., DOCKER_PROXY=my-registry.com/dockerhub/
DOCKER_PROXY ?=
DOCKER_BUILDX_CMD ?= docker buildx
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
# These are always passed as --build-arg by image-build and therefore override
# the ARG defaults in the Dockerfile; keep the tags identical to those defaults
# (the Dockerfile pins the bookworm variant of the Go image).
BUILDER_IMAGE ?= $(DOCKER_PROXY)golang:1.24-bookworm
BASE_IMAGE ?= $(DOCKER_PROXY)ubuntu:24.04
# Container tool
# NOTE(review): CONTAINER_TOOL is not referenced by any target shown in this Makefile.
CONTAINER_TOOL ?= docker
# Kind cluster name for local testing
KIND_CLUSTER ?= kind
# Project directory
# Taken from the working directory make runs in, not from the Makefile's own location.
PROJECT_DIR := $(shell pwd)
# Dynamo directories
# Default: assume we're in dynamo/deploy/inference-gateway/epp
# DYNAMO_DIR is the repository root (three levels up); the lib and include
# directories are where dynamo-lib copies the static library and header.
DYNAMO_DIR ?= $(shell cd $(PROJECT_DIR)/../../.. && pwd)
DYNAMO_LIB_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/lib
DYNAMO_INCLUDE_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/include
# Prints a usage summary generated from this file: every "target: ... ## text"
# line becomes a help entry and every "##@ Title" line becomes a section heading.
.PHONY: help
help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
##@ Development
# Formats every package in the module.
.PHONY: fmt
fmt: ## Run go fmt
go fmt ./...
# Runs the Go static checks on every package.
.PHONY: vet
vet: ## Run go vet
go vet ./...
# Synchronises go.mod / go.sum with the imports in the source tree.
.PHONY: tidy
tidy: ## Run go mod tidy
go mod tidy
# Runs all tests verbosely with cgo explicitly enabled.
.PHONY: test
test: ## Run tests
CGO_ENABLED=1 go test ./... -v
##@ Build
# Compiles ./cmd/epp into bin/epp on the host; fails early through
# dynamo-lib-check when the Dynamo library or header is missing.
.PHONY: build
build: dynamo-lib-check ## Build the EPP binary locally (requires CGO and Dynamo libraries)
CGO_ENABLED=1 go build -o bin/epp ./cmd/epp
# Convenience target: (re)builds the Dynamo library first, then the binary.
.PHONY: build-with-lib
build-with-lib: dynamo-lib build ## Build Dynamo library and EPP binary
# Builds the image for $(PLATFORMS) from the current directory, passing the
# base/builder images and git metadata as build args. PUSH and LOAD are empty
# unless set by a dependent target or on the command line, so by default the
# result is neither pushed nor loaded.
.PHONY: image-build
image-build: dynamo-lib-check ## Build the Docker image using buildx
$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
--platform=$(PLATFORMS) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
--build-arg COMMIT_SHA=$(GIT_COMMIT_SHA) \
--build-arg BUILD_REF=$(GIT_TAG) \
$(PUSH) \
$(LOAD) \
.
# Runs image-build with the target-specific variable PUSH=--push.
.PHONY: image-push
image-push: PUSH=--push ## Build and push the Docker image
image-push: image-build
# Runs image-build with the target-specific variable LOAD=--load.
.PHONY: image-load
image-load: LOAD=--load ## Build and load the Docker image locally
image-load: image-build
# Loads the freshly built image into the kind cluster named $(KIND_CLUSTER).
.PHONY: image-kind
image-kind: image-load ## Build and load the image into kind cluster
kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
##@ Local Development with Buildx
# Builds the image with a throw-away buildx builder. The builder is removed by
# an EXIT trap so it is cleaned up even when the nested image-build fails; the
# recipe's exit status is still that of the nested make.
.PHONY: image-local-build
image-local-build: ## Build image using a new buildx builder
	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use) && \
	trap '$(DOCKER_BUILDX_CMD) rm "$$BUILDER"' EXIT && \
	$(MAKE) image-build PUSH=$(PUSH) LOAD=$(LOAD)
# Runs image-local-build with PUSH=--push; the value is forwarded to the nested image-build.
.PHONY: image-local-push
image-local-push: PUSH=--push ## Build and push using local buildx builder
image-local-push: image-local-build
# Runs image-local-build with LOAD=--load; the value is forwarded to the nested image-build.
.PHONY: image-local-load
image-local-load: LOAD=--load ## Build and load using local buildx builder
image-local-load: image-local-build
##@ Dynamo Library Build
# Builds the libdynamo_llm crate in release mode inside $(DYNAMO_DIR), generates
# the C header with cbindgen (falling back to the checked-in fallback_header.h
# when cbindgen fails), then copies the header and the static library into this
# project's include/ and lib/ directories.
.PHONY: dynamo-lib
dynamo-lib: ## Build Dynamo static library and copy to project
@echo "Building Dynamo static library..."
cd "$(DYNAMO_DIR)" && cargo build --release -p libdynamo_llm
@echo "Generating C header..."
@mkdir -p "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm"
cd "$(DYNAMO_DIR)" && \
(cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm \
--output lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h || \
cp lib/bindings/c/src/fallback_header.h lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h)
@echo "Copying files to EPP project..."
@mkdir -p "$(DYNAMO_LIB_DIR)"
@mkdir -p "$(DYNAMO_INCLUDE_DIR)"
cp "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h" "$(DYNAMO_INCLUDE_DIR)/"
cp "$(DYNAMO_DIR)/target/release/libdynamo_llm_capi.a" "$(DYNAMO_LIB_DIR)/"
@echo "Dynamo library ready!"
# Guard used by the build targets: stops with an error unless both the static
# library and the generated header have been copied in by `make dynamo-lib`.
.PHONY: dynamo-lib-check
dynamo-lib-check: ## Check if Dynamo library files exist
	@test -f "$(DYNAMO_LIB_DIR)/libdynamo_llm_capi.a" || { \
		echo "ERROR: Dynamo library not found. Run 'make dynamo-lib' first."; \
		exit 1; \
	}
	@test -f "$(DYNAMO_INCLUDE_DIR)/llm_engine.h" || { \
		echo "ERROR: Dynamo header not found. Run 'make dynamo-lib' first."; \
		exit 1; \
	}
	@echo "Dynamo library files found."
##@ Clean
# Removes bin/ and runs `go clean`; the copied Dynamo library and header are left in place.
.PHONY: clean
clean: ## Clean build artifacts
rm -rf bin/
go clean
##@ All-in-one Build
# One-shot pipelines: each lists dynamo-lib before an image target as prerequisites.
.PHONY: all
all: dynamo-lib image-local-load ## Build Dynamo lib and Docker image, load locally
.PHONY: all-push
all-push: dynamo-lib image-push ## Build Dynamo lib and Docker image, push to registry
.PHONY: all-kind
all-kind: dynamo-lib image-kind ## Build Dynamo lib and Docker image, load to kind
##@ Info
# Prints the resolved values of the build variables; useful for checking overrides.
.PHONY: info
info: ## Show build info
@echo "Image Tag: $(IMAGE_TAG)"
@echo "Git Commit: $(GIT_COMMIT_SHA)"
@echo "Git Tag: $(GIT_TAG)"
@echo "Platforms: $(PLATFORMS)"
@echo "Docker Proxy: $(DOCKER_PROXY)"
@echo "Builder Image: $(BUILDER_IMAGE)"
@echo "Base Image: $(BASE_IMAGE)"
@echo "Dynamo Dir: $(DYNAMO_DIR)"
@echo "Dynamo Lib Dir: $(DYNAMO_LIB_DIR)"
@echo "Dynamo Include Dir: $(DYNAMO_INCLUDE_DIR)"
/*
Copyright 2025 NVIDIA Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Dynamo EPP - Custom Endpoint Picker Plugin for NVIDIA Dynamo
//
// This EPP integrates with the Gateway API Inference Extension to provide
// KV-aware routing for Dynamo inference backends.
//
// # Header-Based Routing
//
// The Dynamo KV scorer sets routing headers that the Lua filter at the
// gateway uses to inject nvext into the request body:
//
// - x-worker-instance-id: Selected worker ID (decode worker in disagg mode)
// - x-prefiller-host-port: Prefill worker ID (disaggregated mode only)
// - x-dynamo-routing-mode: "aggregated" or "disaggregated"
//
// The Lua filter reads these headers and injects:
// - Aggregated: {"nvext": {"backend_instance_id": <worker_id>}}
// - Disaggregated: {"nvext": {"prefill_worker_id": <prefill>, "decode_worker_id": <decode>}}
package main
import (
"os"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
// Dynamo plugins
dynscorer "github.com/nvidia/dynamo/deploy/inference-gateway/pkg/plugins/dynamo_kv_scorer"
)
// main registers the Dynamo scorer factory with the GAIE plugin registry and
// then hands control to the standard GAIE runner. The process exits with
// status 1 when the runner returns an error.
func main() {
	// Expose the Dynamo KV-aware scorer under the plugin type "kv-aware-scorer".
	// As described by the original authors, the plugin covers:
	//   - Score: calls the Dynamo router to select workers based on KV cache
	//     and sets the routing headers
	//   - PreRequest: registers the request with router bookkeeping once
	//     scheduling is finalized
	//   - ResponseComplete: cleans up router bookkeeping when the response
	//     completes
	plugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)

	// The standard GAIE runner registers the built-in plugins itself.
	eppRunner := runner.NewRunner()
	signalCtx := ctrl.SetupSignalHandler()
	runErr := eppRunner.Run(signalCtx)
	if runErr != nil {
		os.Exit(1)
	}
}
module github.com/nvidia/dynamo/deploy/inference-gateway
go 1.24.0
require (
sigs.k8s.io/controller-runtime v0.22.4
sigs.k8s.io/gateway-api-inference-extension v1.2.1
)
require (
cel.dev/expr v0.24.0 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/dennwc/varint v1.0.0 // indirect
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect
github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.2 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/cel-go v0.26.0 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.4 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/prometheus/prometheus v0.308.1 // indirect
github.com/spf13/cobra v1.9.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect
github.com/stoewer/go-strcase v1.3.0 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 // indirect
go.opentelemetry.io/otel/metric v1.39.0 // indirect
go.opentelemetry.io/otel/sdk v1.39.0 // indirect
go.opentelemetry.io/otel/trace v1.39.0 // indirect
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.1 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/oauth2 v0.32.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.39.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.13.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
google.golang.org/grpc v1.78.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/api v0.34.3 // indirect
k8s.io/apiextensions-apiserver v0.34.3 // indirect
k8s.io/apimachinery v0.34.3 // indirect
k8s.io/apiserver v0.34.3 // indirect
k8s.io/client-go v0.34.3 // indirect
k8s.io/component-base v0.34.3 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)
// NOTE: For local development, uncomment the replace directive below.
// For Docker builds, keep it commented out to use the published v1.2.1 release.
// replace sigs.k8s.io/gateway-api-inference-extension => ../../../gaie_latest/gateway-api-inference-extension
cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
cloud.google.com/go/auth v0.17.0 h1:74yCm7hCj2rUyyAocqnFzsAYXgJhrG26XCFimrc/Kz4=
cloud.google.com/go/auth v0.17.0/go.mod h1:6wv/t5/6rOPAX4fJiRjKkJCvswLwdet7G8+UGXt7nCQ=
cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc=
cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1 h1:5YTBM8QDVIBN3sxBil89WfdAAqDZbyJTgh688DSxX5w=
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.19.1/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0 h1:wL5IEG5zb7BVv1Kv0Xm92orq+5hB5Nipn3B5tn4Rqfk=
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.12.0/go.mod h1:J7MUC/wtRpfGVbQ5sIItY5/FuVWmvzlY21WAOfQnq/I=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA=
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0 h1:XkkQbfMyuH2jTSjQjSoihryI8GINRcs4xp8lNawg0FI=
github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0=
github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs=
github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
github.com/aws/aws-sdk-go-v2 v1.39.6 h1:2JrPCVgWJm7bm83BDwY5z8ietmeJUbh3O2ACnn+Xsqk=
github.com/aws/aws-sdk-go-v2 v1.39.6/go.mod h1:c9pm7VwuW0UPxAEYGyTmyurVcNrbF6Rt/wixFqDhcjE=
github.com/aws/aws-sdk-go-v2/config v1.31.17 h1:QFl8lL6RgakNK86vusim14P2k8BFSxjvUkcWLDjgz9Y=
github.com/aws/aws-sdk-go-v2/config v1.31.17/go.mod h1:V8P7ILjp/Uef/aX8TjGk6OHZN6IKPM5YW6S78QnRD5c=
github.com/aws/aws-sdk-go-v2/credentials v1.18.21 h1:56HGpsgnmD+2/KpG0ikvvR8+3v3COCwaF4r+oWwOeNA=
github.com/aws/aws-sdk-go-v2/credentials v1.18.21/go.mod h1:3YELwedmQbw7cXNaII2Wywd+YY58AmLPwX4LzARgmmA=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13 h1:T1brd5dR3/fzNFAQch/iBKeX07/ffu/cLu+q+RuzEWk=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.13/go.mod h1:Peg/GBAQ6JDt+RoBf4meB1wylmAipb7Kg2ZFakZTlwk=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.1 h1:0JPwLz1J+5lEOfy/g0SURC9cxhbQ1lIMHMa+AHZSzz0=
github.com/aws/aws-sdk-go-v2/service/sso v1.30.1/go.mod h1:fKvyjJcz63iL/ftA6RaM8sRCtN4r4zl4tjL3qw5ec7k=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.5 h1:OWs0/j2UYR5LOGi88sD5/lhN6TDLG6SfA7CqsQO9zF0=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.5/go.mod h1:klO+ejMvYsB4QATfEOIXk8WAEwN4N0aBfJpvC+5SZBo=
github.com/aws/aws-sdk-go-v2/service/sts v1.39.1 h1:mLlUgHn02ue8whiR4BmxxGJLR2gwU6s6ZzJ5wDamBUs=
github.com/aws/aws-sdk-go-v2/service/sts v1.39.1/go.mod h1:E19xDjpzPZC7LS2knI9E6BaRFDK43Eul7vd6rSq2HWk=
github.com/aws/smithy-go v1.23.2 h1:Crv0eatJUQhaManss33hS5r40CG3ZFH+21XSkqMrIUM=
github.com/aws/smithy-go v1.23.2/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0=
github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 h1:6df1vn4bBlDDo4tARvBm7l6KA9iVMnE3NWizDeWSrps=
github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3/go.mod h1:CIWtjkly68+yqLPbvwwR/fjNJA/idrtULjZWh2v1ys0=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f h1:Y8xYupdHxryycyPlc9Y+bSQAYZnetRJ70VMVKm5CKI0=
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f/go.mod h1:HlzOvOjVBOfTGSRXRyY0OiCS/3J1akRGQQpRO/7zyF4=
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE=
github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA=
github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes=
github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g=
github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98=
github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k=
github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ=
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA=
github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk=
github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU=
github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/cel-go v0.26.0 h1:DPGjXackMpJWH680oGY4lZhYjIameYmR+/6RBdDGmaI=
github.com/google/cel-go v0.26.0/go.mod h1:A9O8OU9rdvrK5MQyrqfIxo1a0u4g3sF8KB6PUIaryMM=
github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo=
github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8 h1:ZI8gCoCjGzPsum4L21jHdQs8shFBIQih1TM9Rd/c+EQ=
github.com/google/pprof v0.0.0-20250923004556-9e5a51aed1e8/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U=
github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4=
github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA=
github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo=
github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 h1:cLN4IBkmkYZNnk7EAJ0BHIethd+J6LqxFNw5mSiI2bM=
github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU=
github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8=
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
github.com/oklog/ulid/v2 v2.1.1 h1:suPZ4ARWLOJLegGFiZZ1dFAkqzhMjL3J1TzI+5wHz8s=
github.com/oklog/ulid/v2 v2.1.1/go.mod h1:rcEKHmBBKfef9DhnvX7y1HZBYxjXb0cP5ExxNsTT1QQ=
github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns=
github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_golang/exp v0.0.0-20251212205219-7ba246a648ca h1:BOxmsLoL2ymn8lXJtorca7N/m+2vDQUDoEtPjf0iAxA=
github.com/prometheus/client_golang/exp v0.0.0-20251212205219-7ba246a648ca/go.mod h1:gndBHh3ZdjBozGcGrjUYjN3UJLRS3l2drALtu4lUt+k=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc=
github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI=
github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos=
github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM=
github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
github.com/prometheus/prometheus v0.308.1 h1:ApMNI/3/es3Ze90Z7CMb+wwU2BsSYur0m5VKeqHj7h4=
github.com/prometheus/prometheus v0.308.1/go.mod h1:aHjYCDz9zKRyoUXvMWvu13K9XHOkBB12XrEqibs3e0A=
github.com/prometheus/sigv4 v0.3.0 h1:QIG7nTbu0JTnNidGI1Uwl5AGVIChWUACxn2B/BQ1kms=
github.com/prometheus/sigv4 v0.3.0/go.mod h1:fKtFYDus2M43CWKMNtGvFNHGXnAJJEGZbiYCmVp/F8I=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs=
github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 h1:f0cb2XPmrqn4XMy9PNliTgRKJgS5WcL/u0/WRYGz4t0=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0/go.mod h1:vnakAaFckOMiMtOIhFI2MNH4FYrZzXCYxmb1LlhoGz8=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 h1:in9O8ESIOlwJAEGTkkf34DesGRAc/Pn8qJ7k3r/42LM=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0/go.mod h1:Rp0EXBm5tfnv0WL+ARyO/PHBEaEAT8UUHQ6AGJcSq6c=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 h1:8UPA4IbVZxpsD76ihGOQiFml99GPAEZLohDXvqHdi6U=
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0/go.mod h1:MZ1T/+51uIVKlRzGw1Fo46KEWThjlCBZKl2LzY5nv4g=
go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18=
go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE=
go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8=
go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew=
go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc=
go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU=
golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc=
golang.org/x/exp v0.0.0-20250808145144-a408d31f581a h1:Y+7uR/b1Mw2iSXZ3G//1haIiSElDQZ8KWh0h+sZPG90=
golang.org/x/exp v0.0.0-20250808145144-a408d31f581a/go.mod h1:rT6SFzZ7oxADUDx58pcaKFTcZ+inxAa9fTrYx/uVYwg=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA=
golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
golang.org/x/oauth2 v0.32.0 h1:jsCblLleRMDrxMN29H3z/k1KliIvpLgCkE6R8FXXNgY=
golang.org/x/oauth2 v0.32.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk=
golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI=
golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
google.golang.org/api v0.252.0 h1:xfKJeAJaMwb8OC9fesr369rjciQ704AjU/psjkKURSI=
google.golang.org/api v0.252.0/go.mod h1:dnHOv81x5RAmumZ7BWLShB/u7JZNeyalImxHmtTHxqw=
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls=
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc=
google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.34.3 h1:D12sTP257/jSH2vHV2EDYrb16bS7ULlHpdNdNhEw2S4=
k8s.io/api v0.34.3/go.mod h1:PyVQBF886Q5RSQZOim7DybQjAbVs8g7gwJNhGtY5MBk=
k8s.io/apiextensions-apiserver v0.34.3 h1:p10fGlkDY09eWKOTeUSioxwLukJnm+KuDZdrW71y40g=
k8s.io/apiextensions-apiserver v0.34.3/go.mod h1:aujxvqGFRdb/cmXYfcRTeppN7S2XV/t7WMEc64zB5A0=
k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE=
k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/apiserver v0.34.3 h1:uGH1qpDvSiYG4HVFqc6A3L4CKiX+aBWDrrsxHYK0Bdo=
k8s.io/apiserver v0.34.3/go.mod h1:QPnnahMO5C2m3lm6fPW3+JmyQbvHZQ8uudAu/493P2w=
k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A=
k8s.io/client-go v0.34.3/go.mod h1:OxxeYagaP9Kdf78UrKLa3YZixMCfP6bgPwPwNBQBzpM=
k8s.io/component-base v0.34.3 h1:zsEgw6ELqK0XncCQomgO9DpUIzlrYuZYA0Cgo+JWpVk=
k8s.io/component-base v0.34.3/go.mod h1:5iIlD8wPfWE/xSHTRfbjuvUul2WZbI2nOUK65XL0E/c=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0=
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A=
sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8=
sigs.k8s.io/gateway-api-inference-extension v1.2.1 h1:kQjnFWW8YLCN42EZxDNxTuDE0xHkPkoyaEVpQ5sNCBQ=
sigs.k8s.io/gateway-api-inference-extension v1.2.1/go.mod h1:/HWeqxuOMjFM56YwJ2Spt3qceK7Spz4hk6ZfXYgE9a8=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU=
sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 h1:JrhdFMqOd/+3ByqlP2I45kTOZmTRLBUm5pvRjeheg7E=
sigs.k8s.io/structured-merge-diff/v6 v6.3.1/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE=
sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs=
sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4=
/*
Copyright 2025 NVIDIA Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamo_kv_scorer
/*
#cgo CPPFLAGS: -I${SRCDIR}/include
#cgo CXXFLAGS: -std=c++17
#cgo LDFLAGS: ${SRCDIR}/lib/libdynamo_llm_capi.a -lstdc++ -ldl -lpthread -lm
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h> // for free
#include <stdbool.h>
// enum underlying type is uint32_t; matches cbindgen output
typedef uint32_t dynamo_llm_result_t;
enum { DYNAMO_OK = 0, DYNAMO_ERR = 1 };
// opaque handle forward-decl
struct WorkerSelectionPipeline;
typedef struct WorkerSelectionPipeline WorkerSelectionPipeline;
// Prototypes (C-compatible)
dynamo_llm_result_t dynamo_llm_init(const char *namespace_c_str,
const char *component_c_str,
int64_t worker_id,
uint32_t kv_block_size);
dynamo_llm_result_t dynamo_llm_shutdown(void);
dynamo_llm_result_t dynamo_llm_load_publisher_create(void);
dynamo_llm_result_t dynamo_kv_event_publish_stored(uint64_t event_id,
const uint32_t *token_ids,
const uintptr_t *num_block_tokens,
const uint64_t *block_ids,
size_t num_blocks,
const uint64_t *parent_hash,
uint64_t lora_id);
dynamo_llm_result_t dynamo_kv_event_publish_removed(uint64_t event_id,
const uint64_t *block_ids,
size_t num_blocks);
dynamo_llm_result_t dynamo_create_worker_selection_pipeline(const char *namespace_c_str,
const char *component_c_str,
const char *model_name_c_str,
bool use_kv_routing,
double busy_threshold,
double overlap_score_weight,
double router_temperature,
bool use_kv_events,
bool router_replica_sync,
bool enforce_disagg,
WorkerSelectionPipeline **pipeline_out);
dynamo_llm_result_t dynamo_destroy_worker_selection_pipeline(WorkerSelectionPipeline *pipeline);
dynamo_llm_result_t dynamo_query_worker_selection_and_annotate(WorkerSelectionPipeline *pipeline,
const char *request_json_c_str,
int64_t *decode_worker_id_out,
int64_t *prefill_worker_id_out,
uint32_t **token_ids_out,
size_t *token_count_out,
char **annotated_request_json_out);
dynamo_llm_result_t dynamo_free_worker_selection_result(uint32_t *token_ids,
size_t token_count,
char *annotated_request_json);
// Router bookkeeping functions for GAIE integration
dynamo_llm_result_t dynamo_router_add_request(WorkerSelectionPipeline *pipeline,
const char *request_id_c_str,
const uint32_t *token_ids,
size_t token_count,
uint64_t worker_id,
uint32_t dp_rank);
dynamo_llm_result_t dynamo_router_mark_prefill_complete(WorkerSelectionPipeline *pipeline,
const char *request_id_c_str);
dynamo_llm_result_t dynamo_router_free_request(WorkerSelectionPipeline *pipeline,
const char *request_id_c_str);
*/
import "C"
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"sync"
"unsafe"
log "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
rc "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
schedtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)
// Plugin identity and the request header names written by Score.
const (
// PluginName is the default instance name assigned by NewKVAwareScorer.
PluginName = "dynamo-kv-scorer"
// KVAwareScorerType is the plugin type stored in the scorer's TypedName.
KVAwareScorerType = "kv-aware-scorer"
// WorkerIDHeader is the request header Score sets to the worker ID chosen by the Dynamo router.
WorkerIDHeader = "x-worker-instance-id"
// PrefillWorkerIDHeader is the request header Score sets to the prefill worker ID;
// it is written only when the prefill worker differs from the selected worker.
PrefillWorkerIDHeader = "x-prefill-instance-id"
// RoutingModeHeader is the request header Score sets to "aggregated" or "disaggregated".
RoutingModeHeader = "x-dynamo-routing-mode"
// stateKey is the key used to store routing state in PluginState
stateKey = "dynamo-routing-state"
)
// --------------------------- config / env ---------------------------
// warmupOnce makes KVAwareScorerFactory run its FFI warm-up at most once per process.
var warmupOnce sync.Once
// warmupErr records the outcome of that warm-up; while non-nil, every factory call fails with it.
var warmupErr error
// params is the target KVAwareScorerFactory decodes the plugin's JSON parameters into.
// It has no fields, so no parameter currently affects the scorer.
type params struct{}
// DynamoRoutingState holds routing information passed from Score() to PreRequest().
// This is stored in PluginState keyed by request ID.
type DynamoRoutingState struct {
// WorkerID is the decimal string form of the decode worker ID returned by the router.
WorkerID string
// PrefillWorkerID is the decimal string form of the prefill worker ID, or ""
// when the router reported a negative prefill worker ID.
PrefillWorkerID string
// TokenData holds the token IDs from the router.
// PreRequest forwards them to the router's bookkeeping via callAddRequest.
// They are not sent to the worker; that may change in a future implementation
// where tokens are passed via request body instead of headers.
TokenData []int64
}
// Clone implements plugins.StateData. The returned copy owns its own TokenData
// backing array (a nil TokenData stays nil); a nil receiver yields nil.
func (s *DynamoRoutingState) Clone() plugins.StateData {
	if s == nil {
		return nil
	}
	// Value copy carries over the string fields; the slice is re-allocated below
	// so the copy never aliases the receiver's tokens.
	dup := *s
	if s.TokenData != nil {
		dup.TokenData = make([]int64, len(s.TokenData))
		copy(dup.TokenData, s.TokenData)
	}
	return &dup
}
// KVAwareScorer is the Dynamo KV-aware plugin. Its Score method consults the
// Dynamo router, PreRequest registers the request with the router's
// bookkeeping, and ResponseComplete releases that bookkeeping.
type KVAwareScorer struct {
// typedName is the type/name pair returned by TypedName.
typedName plugins.TypedName
// pluginState carries the DynamoRoutingState written by Score and read by PreRequest.
pluginState *plugins.PluginState
}
// Compile-time assertions that KVAwareScorer implements each interface it relies on.
var _ plugins.Plugin = (*KVAwareScorer)(nil)
var _ framework.Scorer = (*KVAwareScorer)(nil)
var _ rc.PreRequest = (*KVAwareScorer)(nil)
var _ rc.ResponseComplete = (*KVAwareScorer)(nil)
// NewKVAwareScorer returns a scorer carrying the default type/name pair
// (KVAwareScorerType, PluginName) and a PluginState created from ctx.
func NewKVAwareScorer(ctx context.Context) *KVAwareScorer {
	scorer := new(KVAwareScorer)
	scorer.typedName = plugins.TypedName{Type: KVAwareScorerType, Name: PluginName}
	scorer.pluginState = plugins.NewPluginState(ctx)
	return scorer
}
// WithName replaces the plugin instance name and returns the receiver so the
// call can be chained onto the constructor.
func (k *KVAwareScorer) WithName(name string) *KVAwareScorer {
	k.typedName.Name = name
	return k
}
// KVAwareScorerFactory builds a KVAwareScorer named name and performs the
// process-wide, one-time FFI warm-up (runtime + persistent pipeline).
//
// raw is decoded into params on a best-effort basis: params has no fields and
// a decode error is deliberately ignored.
//
// It returns an error when the warm-up failed or panicked. The warm-up is never
// retried, so every later call returns the same error.
func KVAwareScorerFactory(name string, raw json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
	var cfg params
	_ = json.Unmarshal(raw, &cfg)

	scorer := NewKVAwareScorer(handle.Context())
	scorer = scorer.WithName(name)

	// A panic during warm-up is converted into warmupErr instead of crashing
	// the caller.
	warmup := func() {
		defer func() {
			if recovered := recover(); recovered != nil {
				warmupErr = fmt.Errorf("Dynamo configuration error: %v", recovered)
			}
		}()
		warmupErr = initFFI()
	}
	warmupOnce.Do(warmup)

	if warmupErr == nil {
		return scorer, nil
	}
	return nil, fmt.Errorf("Dynamo FFI init for the Router failed: %w", warmupErr)
}
// TypedName returns the plugin's type/name pair.
func (k *KVAwareScorer) TypedName() plugins.TypedName {
	return k.typedName
}
// --------------------------- FFI integration ---------------------------
// Package-level FFI state shared by every scorer instance in the process.
var (
// ffiOnce makes the body of initFFI run at most once.
ffiOnce sync.Once
// ffiErr is the error recorded by that single initFFI run and returned by every initFFI call.
ffiErr error
// The ffi* settings below are populated by loadDynamoConfig from DYNAMO_* environment variables.
ffiNamespace string
ffiComponent string
ffiModel string
ffiOverlapScoreWeight float64
ffiRouterTemperature float64
ffiKvBlockSize uint32
ffiWorkerID int64
ffiEnforceDisagg bool
// runtimeInitialized is set once dynamo_llm_init succeeds and cleared once dynamo_llm_shutdown succeeds.
runtimeInitialized bool
// Boxed pipeline handle (owned on the Rust side, opaque here)
pipeline *C.struct_WorkerSelectionPipeline
// pipelineMutex guards reads and writes of pipeline.
pipelineMutex sync.RWMutex
)
// loadDynamoConfig populates the package-level ffi* settings from environment
// variables.
//
// Optional variables and their defaults:
//   - DYNAMO_NAMESPACE ("vllm-agg"), DYNAMO_COMPONENT ("backend"),
//     DYNAMO_MODEL ("Qwen/Qwen3-0.6B"), DYNAMO_WORKER_ID (1),
//     DYNAMO_ENFORCE_DISAGG (false),
//     DYNAMO_OVERLAP_SCORE_WEIGHT (-1.0), DYNAMO_ROUTER_TEMPERATURE (-1.0).
//
// DYNAMO_KV_BLOCK_SIZE is mandatory and must be a power of two in [16, 8192].
//
// Panics when DYNAMO_KV_BLOCK_SIZE is unset, not an integer, out of range, or
// not a power of two. ffiKvBlockSize is assigned only after validation passes.
func loadDynamoConfig() {
	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
	ffiEnforceDisagg = getEnvBoolOrDefault("DYNAMO_ENFORCE_DISAGG", false)
	ffiOverlapScoreWeight = getEnvFloatOrDefault("DYNAMO_OVERLAP_SCORE_WEIGHT", -1.0)
	ffiRouterTemperature = getEnvFloatOrDefault("DYNAMO_ROUTER_TEMPERATURE", -1.0)

	kvBlockSizeStr := os.Getenv("DYNAMO_KV_BLOCK_SIZE")
	if kvBlockSizeStr == "" {
		panic("DYNAMO_KV_BLOCK_SIZE is required and must match the model card's kv_cache_block_size")
	}
	var parsed int64
	if n, err := fmt.Sscanf(kvBlockSizeStr, "%d", &parsed); err != nil || n != 1 {
		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE='%s' is not a valid integer", kvBlockSizeStr))
	}
	// Validate the int64 value BEFORE narrowing to uint32. Converting first
	// would let negative or >32-bit inputs wrap into the accepted range
	// (e.g. 4294967312 would silently become 16).
	if parsed < 16 || parsed > 8192 {
		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d outside [16,8192]", parsed))
	}
	// A power of two has exactly one bit set, so clearing the lowest set bit yields zero.
	if parsed&(parsed-1) != 0 {
		panic(fmt.Sprintf("DYNAMO_KV_BLOCK_SIZE=%d must be a power of 2", parsed))
	}
	ffiKvBlockSize = uint32(parsed)
	fmt.Printf("Dynamo KV Scorer: Loaded DYNAMO_KV_BLOCK_SIZE=%d\n", ffiKvBlockSize)
}
func getEnvOrDefault(key, def string) string {
if v := os.Getenv(key); v != "" {
return v
}
return def
}
func getEnvInt64OrDefault(key string, def int64) int64 {
if v := os.Getenv(key); v != "" {
var p int64
if n, err := fmt.Sscanf(v, "%d", &p); err == nil && n == 1 {
return p
}
}
return def
}
func getEnvFloatOrDefault(key string, def float64) float64 {
if v := os.Getenv(key); v != "" {
var p float64
if n, err := fmt.Sscanf(v, "%f", &p); err == nil && n == 1 {
return p
}
}
return def
}
func getEnvBoolOrDefault(key string, def bool) bool {
if v := os.Getenv(key); v != "" {
switch strings.ToLower(v) {
case "true", "1", "yes", "on":
return true
case "false", "0", "no", "off":
return false
}
}
return def
}
// initFFI initializes the Dynamo runtime and creates the persistent boxed
// worker selection pipeline. The work is attempted at most once per process;
// every call returns the outcome of that single attempt.
//
// A panic raised during the attempt (loadDynamoConfig panics on invalid
// configuration) is converted into the returned error. sync.Once treats a
// panicking function as having returned, so without the recovery a later call
// would return nil even though initialization never completed.
func initFFI() error {
	ffiOnce.Do(func() {
		// Registered first so it runs last, after the frees and the unlock below.
		defer func() {
			if r := recover(); r != nil {
				ffiErr = fmt.Errorf("Dynamo configuration error: %v", r)
			}
		}()

		loadDynamoConfig()

		ns := C.CString(ffiNamespace)
		cm := C.CString(ffiComponent)
		model := C.CString(ffiModel)
		defer C.free(unsafe.Pointer(ns))
		defer C.free(unsafe.Pointer(cm))
		defer C.free(unsafe.Pointer(model))

		// Init Dynamo runtime
		if status := C.dynamo_llm_init(ns, cm, C.int64_t(ffiWorkerID), C.uint32_t(ffiKvBlockSize)); status != C.DYNAMO_OK {
			ffiErr = fmt.Errorf("dynamo_llm_init failed")
			return
		}
		runtimeInitialized = true

		// Create persistent pipeline
		pipelineMutex.Lock()
		defer pipelineMutex.Unlock()
		// Named status rather than rc so the requestcontrol import alias is not shadowed.
		status := C.dynamo_create_worker_selection_pipeline(
			ns,
			cm,
			model,
			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_ROUTING", true)),
			C.double(getEnvFloatOrDefault("DYNAMO_BUSY_THRESHOLD", -1.0)),
			C.double(ffiOverlapScoreWeight),
			C.double(ffiRouterTemperature),
			C.bool(getEnvBoolOrDefault("DYNAMO_USE_KV_EVENTS", true)),
			C.bool(getEnvBoolOrDefault("DYNAMO_ROUTER_REPLICA_SYNC", true)),
			C.bool(ffiEnforceDisagg),
			&pipeline,
		)
		if status != C.DYNAMO_OK {
			ffiErr = fmt.Errorf("dynamo_create_worker_selection_pipeline failed")
			return
		}
	})
	return ffiErr
}
// --------------------------- scoring ---------------------------
// Score asks the Dynamo router to pick a worker for req and records the
// decision on the request; it does not rank pods itself.
//
// When the router returns a worker ID, Score writes the routing headers
// (WorkerIDHeader, RoutingModeHeader, and PrefillWorkerIDHeader in
// disaggregated mode) into req.Headers. If req.RequestId is set, it also stores
// a DynamoRoutingState in the plugin state for PreRequest to consume. A router
// failure is logged and otherwise ignored.
//
// Every pod in pods receives the same score of 1.0. cycleState is not used.
func (k *KVAwareScorer) Score(
	ctx context.Context,
	cycleState *schedtypes.CycleState,
	req *schedtypes.LLMRequest,
	pods []schedtypes.Pod,
) map[schedtypes.Pod]float64 {
	logger := log.FromContext(ctx)

	workerID, prefillWorkerID, tokenData, err := k.callDynamoRouter(ctx, req)
	switch {
	case err != nil:
		logger.V(logutil.DEFAULT).Error(err, "Dynamo call failed; proceeding without worker id")

	case workerID != "":
		logger.V(logutil.DEFAULT).Info(
			"Dynamo router selected worker",
			"workerID", workerID,
			"prefillWorkerID", prefillWorkerID,
			"tokenDataCount", len(tokenData),
		)

		// Headers are read by the Lua filter at the gateway.
		if req.Headers == nil {
			req.Headers = map[string]string{}
		}
		req.Headers[WorkerIDHeader] = workerID

		// Disaggregated means a distinct prefill worker was chosen; otherwise a
		// single worker handles both prefill and decode.
		mode := "aggregated"
		if prefillWorkerID != "" && prefillWorkerID != workerID {
			mode = "disaggregated"
			req.Headers[PrefillWorkerIDHeader] = prefillWorkerID
		}
		req.Headers[RoutingModeHeader] = mode

		// Bookkeeping registration is deferred to PreRequest, which runs after
		// scheduling is finalized, so only committed requests are registered.
		if req.RequestId != "" {
			k.pluginState.Write(req.RequestId, plugins.StateKey(stateKey), &DynamoRoutingState{
				WorkerID:        workerID,
				PrefillWorkerID: prefillWorkerID,
				// Not passed to workers via headers (too large).
				TokenData: tokenData,
			})
		}
	}

	scores := make(map[schedtypes.Pod]float64, len(pods))
	for i := range pods {
		scores[pods[i]] = 1.0
	}
	return scores
}
// PreRequest runs after scheduling is finalized and before the request is sent
// to the worker. It consumes the routing state stored by Score and registers
// the request with the Dynamo router's bookkeeping, so that only requests that
// will actually be dispatched are tracked (no phantom entries).
//
// The stored state for the request is deleted whether or not it could be read.
// A missing request ID, missing state, or a registration failure is logged and
// the method returns without further action. schedulingResult is not used.
func (k *KVAwareScorer) PreRequest(
	ctx context.Context,
	request *schedtypes.LLMRequest,
	schedulingResult *schedtypes.SchedulingResult,
) {
	logger := log.FromContext(ctx)

	if request == nil || request.RequestId == "" {
		logger.V(logutil.DEBUG).Info("PreRequest: no request ID, skipping router bookkeeping")
		return
	}
	requestID := request.RequestId

	// Read first, then delete unconditionally so the state never outlives this hook.
	state, readErr := plugins.ReadPluginStateKey[*DynamoRoutingState](
		k.pluginState, requestID, plugins.StateKey(stateKey),
	)
	k.pluginState.Delete(requestID)
	if readErr != nil {
		// Score() stored nothing, e.g. because the router call failed.
		logger.V(logutil.DEBUG).Info("PreRequest: no routing state found, skipping router bookkeeping",
			"requestID", requestID)
		return
	}

	addErr := k.callAddRequest(ctx, requestID, state.TokenData, state.WorkerID, state.PrefillWorkerID)
	if addErr == nil {
		logger.V(logutil.VERBOSE).Info("PreRequest: registered request with router bookkeeping",
			"requestID", requestID,
			"workerID", state.WorkerID,
			"prefillWorkerID", state.PrefillWorkerID,
			"tokenCount", len(state.TokenData),
		)
		return
	}
	logger.V(logutil.DEFAULT).Error(addErr, "PreRequest: failed to add request to router bookkeeping",
		"requestID", requestID)
}
// ResponseComplete runs after the complete response has been sent to the
// client. It releases the router bookkeeping held for the request by calling
// dynamo_router_free_request through callFreeRequestInternal.
//
// A nil request or an empty request ID is logged and skipped; a failure to
// free is logged. response and targetPod are not used.
func (k *KVAwareScorer) ResponseComplete(
	ctx context.Context,
	request *schedtypes.LLMRequest,
	response *rc.Response,
	targetPod *backend.Pod,
) {
	logger := log.FromContext(ctx)

	if request == nil {
		logger.V(logutil.DEBUG).Info("ResponseComplete: request is nil, skipping cleanup")
		return
	}
	requestID := request.RequestId
	if requestID == "" {
		logger.V(logutil.DEBUG).Info("ResponseComplete: no request ID, skipping cleanup")
		return
	}

	freeErr := callFreeRequestInternal(requestID)
	if freeErr == nil {
		logger.V(logutil.VERBOSE).Info("ResponseComplete: freed request from router",
			"requestID", requestID)
		return
	}
	logger.V(logutil.DEFAULT).Error(freeErr, "ResponseComplete: failed to free request",
		"requestID", requestID)
}
// --------------------------- router call (persistent only) ---------------------------
// callDynamoRouter queries the persistent worker selection pipeline for req.
//
// It returns the decode worker ID as a decimal string, the prefill worker ID as
// a decimal string ("" when the router reports a negative ID), and the token
// IDs produced by the router copied into Go memory.
//
// An error is returned when FFI initialization failed, the runtime or pipeline
// is unavailable, the request cannot be marshalled, or the router call fails.
func (k *KVAwareScorer) callDynamoRouter(
	ctx context.Context,
	req *schedtypes.LLMRequest,
) (workerID string, prefillWorkerID string, tokenData []int64, err error) {
	logger := log.FromContext(ctx)

	if initErr := initFFI(); initErr != nil {
		logger.V(logutil.DEFAULT).Error(initErr, "FFI init failed")
		return "", "", nil, initErr
	}
	if !runtimeInitialized {
		return "", "", nil, fmt.Errorf("dynamo runtime not initialized")
	}

	// Snapshot the handle under the read lock; the FFI call runs outside it.
	pipelineMutex.RLock()
	handle := pipeline
	pipelineMutex.RUnlock()
	if handle == nil {
		return "", "", nil, fmt.Errorf("dynamo worker selection pipeline not created")
	}

	// The router takes an OpenAI-compatible JSON request.
	payload, marshalErr := json.Marshal(buildOpenAIRequest(req))
	if marshalErr != nil {
		logger.V(logutil.DEFAULT).Error(marshalErr, "Failed to marshal OpenAI request")
		return "", "", nil, fmt.Errorf("marshal OpenAI request: %w", marshalErr)
	}
	cPayload := C.CString(string(payload))
	defer C.free(unsafe.Pointer(cPayload))

	// Out-parameters filled in by the router.
	var (
		decodeID  C.int64_t
		prefillID C.int64_t
		tokenPtr  *C.uint32_t
		tokenLen  C.size_t
		annotated *C.char
	)
	status := C.dynamo_query_worker_selection_and_annotate(
		handle,
		cPayload,
		&decodeID,
		&prefillID,
		&tokenPtr,
		&tokenLen,
		&annotated,
	)
	if status != C.DYNAMO_OK {
		return "", "", nil, fmt.Errorf("dynamo_query_worker_selection_and_annotate failed")
	}

	// Copy the tokens into Go memory BEFORE releasing the C-side result.
	n := int(uintptr(tokenLen))
	var tokens []int64
	if n > 0 && tokenPtr != nil {
		tokens = make([]int64, n)
		for i, tok := range unsafe.Slice((*uint32)(unsafe.Pointer(tokenPtr)), n) {
			tokens[i] = int64(tok)
		}
	}
	C.dynamo_free_worker_selection_result(tokenPtr, tokenLen, annotated)

	decodeStr := fmt.Sprintf("%d", int64(decodeID))
	// Rust returns -1 for prefill_worker_id when not in disaggregated mode.
	var prefillStr string
	if int64(prefillID) >= 0 {
		prefillStr = fmt.Sprintf("%d", int64(prefillID))
	}

	logger.V(logutil.DEFAULT).Info("Worker selection completed",
		"workerID", decodeStr, "prefillWorkerID", prefillStr, "tokenCount", n)
	return decodeStr, prefillStr, tokens, nil
}
// buildOpenAIRequest constructs the OpenAI-compatible request body sent to the
// Dynamo router.
//
// The prompt is the space-joined plain text of all chat messages when chat
// completions are present, otherwise the completions prompt, otherwise the
// literal "default prompt". The model is req.TargetModel when it is non-blank,
// else the configured ffiModel. The remaining fields are fixed: max_tokens 1,
// temperature 0.0, stream true, and an nvext annotation requesting
// "query_instance_id". A nil req is accepted.
func buildOpenAIRequest(req *schedtypes.LLMRequest) map[string]any {
	prompt := "default prompt"
	if req != nil && req.Body != nil {
		body := req.Body
		switch {
		case body.ChatCompletions != nil && len(body.ChatCompletions.Messages) > 0:
			parts := make([]string, 0, len(body.ChatCompletions.Messages))
			for _, msg := range body.ChatCompletions.Messages {
				parts = append(parts, msg.Content.PlainText())
			}
			prompt = strings.TrimSpace(strings.Join(parts, " "))
		case body.Completions != nil && body.Completions.Prompt != "":
			prompt = body.Completions.Prompt
		}
	}

	model := ffiModel
	if req != nil && strings.TrimSpace(req.TargetModel) != "" {
		model = req.TargetModel
	}

	return map[string]any{
		"messages":    []map[string]any{{"role": "user", "content": prompt}},
		"model":       model,
		"max_tokens":  1,
		"temperature": 0.0,
		"stream":      true,
		"nvext": map[string]any{
			"annotations": []string{"query_instance_id"},
		},
	}
}
// --------------------------- router bookkeeping ---------------------------
// callAddRequest registers a request with the router's bookkeeping so the
// router can track it as active on the selected worker.
//
// workerID must be a decimal unsigned integer; it identifies the decode worker,
// which is the one used for bookkeeping in disaggregated mode. tokenData is
// narrowed to uint32 before being handed to the router. prefillWorkerID is
// accepted but not currently used. dp_rank is always passed as 0.
//
// It returns an error when the runtime or pipeline is unavailable, workerID
// does not parse, or the router rejects the request.
func (k *KVAwareScorer) callAddRequest(
	ctx context.Context,
	requestID string,
	tokenData []int64,
	workerID string,
	prefillWorkerID string,
) error {
	logger := log.FromContext(ctx)

	if !runtimeInitialized {
		return fmt.Errorf("dynamo runtime not initialized")
	}

	pipelineMutex.RLock()
	handle := pipeline
	pipelineMutex.RUnlock()
	if handle == nil {
		return fmt.Errorf("dynamo worker selection pipeline not created")
	}

	var decodeWorker uint64
	if _, scanErr := fmt.Sscanf(workerID, "%d", &decodeWorker); scanErr != nil {
		return fmt.Errorf("invalid worker ID: %s", workerID)
	}

	narrowed := make([]uint32, 0, len(tokenData))
	for _, tok := range tokenData {
		narrowed = append(narrowed, uint32(tok))
	}

	cID := C.CString(requestID)
	defer C.free(unsafe.Pointer(cID))

	// A nil pointer is passed when there are no tokens.
	var tokenPtr *C.uint32_t
	if len(narrowed) > 0 {
		tokenPtr = (*C.uint32_t)(unsafe.Pointer(&narrowed[0]))
	}

	status := C.dynamo_router_add_request(
		handle,
		cID,
		tokenPtr,
		C.size_t(len(narrowed)),
		C.uint64_t(decodeWorker),
		C.uint32_t(0), // dp_rank = 0 for now
	)
	if status != C.DYNAMO_OK {
		return fmt.Errorf("dynamo_router_add_request failed")
	}

	logger.V(logutil.VERBOSE).Info("Added request to router bookkeeping",
		"requestID", requestID, "workerID", workerID, "tokenCount", len(narrowed))
	return nil
}
// CallMarkPrefillComplete tells the router that prefill has finished for
// requestID. Exported for use by response handlers.
//
// It returns an error when the runtime or pipeline is unavailable or the router
// call fails.
func CallMarkPrefillComplete(requestID string) error {
	if !runtimeInitialized {
		return fmt.Errorf("dynamo runtime not initialized")
	}

	pipelineMutex.RLock()
	handle := pipeline
	pipelineMutex.RUnlock()
	if handle == nil {
		return fmt.Errorf("dynamo worker selection pipeline not created")
	}

	cID := C.CString(requestID)
	defer C.free(unsafe.Pointer(cID))

	if status := C.dynamo_router_mark_prefill_complete(handle, cID); status != C.DYNAMO_OK {
		return fmt.Errorf("dynamo_router_mark_prefill_complete failed")
	}
	return nil
}
// callFreeRequestInternal releases the router state held for a completed or
// cancelled request.
//
// It returns an error when the runtime or pipeline is unavailable or the router
// call fails.
func callFreeRequestInternal(requestID string) error {
	if !runtimeInitialized {
		return fmt.Errorf("dynamo runtime not initialized")
	}

	pipelineMutex.RLock()
	handle := pipeline
	pipelineMutex.RUnlock()
	if handle == nil {
		return fmt.Errorf("dynamo worker selection pipeline not created")
	}

	cID := C.CString(requestID)
	defer C.free(unsafe.Pointer(cID))

	if status := C.dynamo_router_free_request(handle, cID); status != C.DYNAMO_OK {
		return fmt.Errorf("dynamo_router_free_request failed")
	}
	return nil
}
// --------------------------- shutdown ---------------------------
// cleanupDynamo destroys the worker selection pipeline (if one exists) and
// then shuts the Dynamo runtime down (if it was initialized), all under the
// pipeline write lock.
//
// A pipeline destroy failure only prints a warning and the handle is cleared
// regardless. A runtime shutdown failure is returned as an error and leaves
// runtimeInitialized set.
func cleanupDynamo() error {
	pipelineMutex.Lock()
	defer pipelineMutex.Unlock()

	if handle := pipeline; handle != nil {
		status := C.dynamo_destroy_worker_selection_pipeline(handle)
		if status != C.DYNAMO_OK {
			fmt.Printf("Warning: dynamo_destroy_worker_selection_pipeline failed\n")
		}
		pipeline = nil
	}

	if !runtimeInitialized {
		return nil
	}
	if status := C.dynamo_llm_shutdown(); status != C.DYNAMO_OK {
		return fmt.Errorf("dynamo_llm_shutdown failed")
	}
	runtimeInitialized = false
	return nil
}
......@@ -13,6 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Dynamo EPP Configuration
#
# The KV scorer sets routing headers that the Lua filter at the gateway
# reads to inject nvext into the request body:
# - x-worker-instance-id: Selected worker ID
# - x-prefill-instance-id: Prefill worker ID (disaggregated mode only)
# - x-dynamo-routing-mode: "aggregated" or "disaggregated"
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
......@@ -22,14 +30,15 @@ plugins:
# Picker: chooses the final endpoint after scoring
- name: picker
type: max-score-picker
- name: dyn-pre
type: dynamo-inject-workerid
parameters: {}
# Dynamo KV-aware Scorer: calls Dynamo router FFI for worker selection
# Implements Scorer, PreRequest, and ResponseComplete:
# - Score: Selects workers based on KV cache, sets routing headers
# - PreRequest: Registers request with router bookkeeping
# - ResponseComplete: Frees router bookkeeping when response completes
- name: dyn-kv
type: kv-aware-scorer
parameters:
frontendURL: http://127.0.0.1:8000/v1/chat/completions
timeoutMS: 10000
schedulingProfiles:
- name: default
plugins:
......
......@@ -22,4 +22,5 @@ subjects:
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: pod-read
\ No newline at end of file
name: pod-read
apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
......@@ -19,10 +19,10 @@ metadata:
rules:
# Gateway API inference resources
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencepools"]
resources: ["inferencepools", "inferenceobjectives", "inferencemodelrewrites"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencemodels"]
- apiGroups: ["inference.networking.k8s.io"]
resources: ["inferencepools"]
verbs: ["get", "watch", "list"]
# Core resources for pod discovery
- apiGroups: [""]
......
......@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- /* ------------ file-scope vars (no output) ------------ */ -}}
{{- $platformNs := default .Release.Namespace .Values.platformNamespace -}}
{{- $platformName := default "dynamo-platform" .Values.platformReleaseName -}}
......@@ -23,10 +24,10 @@
{{- $std := .Values.extension.standardImage -}}
{{- $dyn := .Values.extension.dynamoImage -}}
{{- $fallback := ternary $dyn $std .Values.epp.useDynamo -}}
{{- $eppImage := default $fallback .Values.extension.image -}}
{{- $eppImage := default $fallback .Values.extension.image }}
--- # <-- start of actual YAML document
---
# Deployment for the EPP (Endpoint Picker Plugin)
apiVersion: apps/v1
kind: Deployment
metadata:
......@@ -61,26 +62,30 @@ spec:
{{- if .Values.epp.argsOverride }}
{{- toYaml .Values.epp.argsOverride | nindent 8 }}
{{- else }}
- -poolName
- -pool-name
- "{{ .Values.model.shortName }}-pool"
- -poolNamespace
- -pool-namespace
- "{{ .Release.Namespace }}"
- -pool-group
- "inference.networking.x-k8s.io"
- -v
- "4"
- --zap-encoder
- "json"
- -grpcPort
- -grpc-port
- "9002"
- -grpcHealthPort
- -grpc-health-port
- "9003"
{{- if $useDynamo }}
- -configFile
- -config-file
- "{{ .Values.epp.configFile }}"
{{- end }}
{{- end }}
{{- if $useDynamo }}
volumeMounts:
- name: hf-cache
mountPath: /home/nonroot/.cache
{{- if $useDynamo }}
- name: epp-config
mountPath: /etc/epp
readOnly: true
......@@ -117,11 +122,21 @@ spec:
value: "true"
- name: USE_STREAMING
value: "true"
# HuggingFace token for downloading model config files
# Without this, HuggingFace rate-limits requests (429 Too Many Requests)
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
optional: true
{{- end }}
{{- range .Values.epp.extraEnv }}
- name: {{ .name }}
value: {{ .value | quote }}
{{- end }}
- name: RUST_LOG
value: "debug,dynamo_llm::kv_router=trace"
ports:
- containerPort: 9002
......@@ -141,8 +156,10 @@ spec:
initialDelaySeconds: 5
periodSeconds: 10
{{- if $useDynamo }}
volumes:
- name: hf-cache
emptyDir: {}
{{- if $useDynamo }}
- name: epp-config
configMap:
name: {{ include "dynamo-gaie.fullname" . }}-epp-config
......
......@@ -14,6 +14,8 @@
# limitations under the License.
{{- if .Values.httpRoute.enabled }}
{{- /* Default gatewayNamespace to the release namespace if not specified */ -}}
{{- $gatewayNs := default .Release.Namespace .Values.httpRoute.gatewayNamespace }}
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
......@@ -24,9 +26,10 @@ spec:
- group: gateway.networking.k8s.io
kind: Gateway
name: {{ .Values.httpRoute.gatewayName }}
namespace: {{ $gatewayNs }}
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: {{ .Values.model.shortName }}-pool
namespace: {{ .Release.Namespace }}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
name: {{ .Values.model.shortName }}-model
namespace: {{ .Release.Namespace }}
spec:
criticality: {{ .Values.model.criticality }}
modelName: {{ .Values.model.identifier }}
poolRef:
group: inference.networking.x-k8s.io
kind: InferencePool
name: {{ .Values.model.shortName }}-pool
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment