"vscode:/vscode.git/clone" did not exist on "4e6c3964d413b2695b93bd9c885a412a9682a8a5"
Unverified Commit 4810ad34 authored by atchernych's avatar atchernych Committed by GitHub
Browse files

feat: update GAIE to release version with hints in headers (#5503)


Signed-off-by: default avatarAnna Tchernych <atchernych@nvidia.com>
parent b31b5b56
# SPDX-FileCopyrightText: Copyright The Kubernetes Authors.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
# Dockerfile.epp - Custom Dockerfile for GAIE EPP. This is to be used with the deploy/inference-gateway/build-epp-dynamo.sh
# Global build arguments. They are declared before the first FROM, so they are
# only visible to FROM lines.
# DOCKER_PROXY is an optional registry prefix prepended to both image names; it
# has no default, so it expands to an empty string when not supplied.
ARG DOCKER_PROXY
ARG BUILDER_IMAGE="golang:1.24"
ARG BASE_IMAGE="ubuntu:22.04"
############################
# Builder
############################
FROM ${DOCKER_PROXY}${BUILDER_IMAGE} AS builder
# The EPP is built with cgo; be explicit about the compilers, which helps cgo
# when linking libstdc++.
ENV CGO_ENABLED=1 \
    CC=gcc \
    CXX=g++
# C/C++ toolchain for cgo, and libstdc++ for link-time.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        g++ \
        gcc \
        libc6-dev \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /src
# Dependencies first, so this layer is reused until go.mod/go.sum change.
COPY go.mod go.sum ./
RUN go mod download
# Source trees needed to build cmd/epp.
COPY cmd/epp ./cmd/epp
COPY pkg/epp ./pkg/epp
COPY internal ./internal
COPY api ./api
# Sanity listing (optional): never fails the build, only prints a hint when the
# Dynamo headers or library directory is missing.
RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/include/ || echo "Headers not found"
RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib/ || echo "Library not found"
# Build metadata. Declared as late as possible: every RUN that follows an ARG
# sees its value, so declaring these earlier would invalidate the apt and
# `go mod download` layers each time the commit SHA changes.
ARG COMMIT_SHA=unknown
ARG BUILD_REF
# Build the EPP binary, stamping commit and build ref into the metrics package.
WORKDIR /src/cmd/epp
RUN go build \
    -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" \
    -o /epp
############################
# Runtime
############################
FROM ${DOCKER_PROXY}${BASE_IMAGE} AS runtime
# Expose the build-time DYNAMO_COMMIT_SHA value in the container environment.
ARG DYNAMO_COMMIT_SHA
ENV DYNAMO_COMMIT_SHA=${DYNAMO_COMMIT_SHA}
# Minimal runtime packages (libstdc++ runtime for the -lstdc++ link), plus an
# unprivileged "nonroot" account that owns a pre-created Hugging Face cache
# directory under its home.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd --system nonroot \
    && useradd --system --gid nonroot --create-home --home-dir /home/nonroot nonroot \
    && mkdir -p /home/nonroot/.cache/huggingface/hub \
    && chown -R nonroot:nonroot /home/nonroot
WORKDIR /
COPY --from=builder /epp /epp
# HOME must point at the nonroot home so ModelExpress can find the cache directory.
ENV HOME=/home/nonroot
USER nonroot:nonroot
ENTRYPOINT ["/epp"]
......@@ -199,8 +199,8 @@ The frontend image is a specialized container that includes the Dynamo component
```
The build process automatically:
1. Clones the Gateway API Inference Extension (GAIE) repository
2. Builds the custom EPP image with Dynamo routing capabilities
1. Builds the Dynamo static library for EPP KV-aware routing
2. Builds the custom EPP Docker image using `make all` from `deploy/inference-gateway/epp/Makefile`
3. Builds the frontend image with the EPP binary and Dynamo runtime components
For more details, see [`deploy/inference-gateway/README.md`](../deploy/inference-gateway/README.md).
......
......@@ -138,10 +138,6 @@ SGLANG_CUDA_VERSION="12.9.1"
SGLANG_CUDA_VERSION_CU13="13.0.1"
SGLANG_RUNTIME_IMAGE_TAG_CU13="v0.5.7-cu130-runtime"
# GAIE (Gateway API Inference Extension) configuration for frontend (required for EPP binary for frontend image)
GAIE_REPO_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension.git"
GAIE_VERSION="v0.5.1"
PYTHON_VERSION="3.12"
NIXL_REF=0.8.0
......@@ -969,39 +965,33 @@ show_image_options
# Handle FRONTEND target: build EPP image first
if [[ ${TARGET^^} == "FRONTEND" ]]; then
echo "Building FRONTEND image - requires EPP image"
# Build base dynamo image first (framework=NONE, target=dev)
echo ""
echo "Building EPP image for Frontend..."
# Set up paths for GAIE
GAIE_CLONE_DIR="${BUILD_CONTEXT}/.build/external/gateway-api-inference-extension"
echo "Building EPP image for Frontend using Makefile..."
# Clone GAIE repo
echo ""
echo "Cloning GAIE repository at ${GAIE_VERSION}..."
$RUN_PREFIX rm -rf "${GAIE_CLONE_DIR}"
$RUN_PREFIX mkdir -p "$(dirname "${GAIE_CLONE_DIR}")"
$RUN_PREFIX git clone ${GAIE_REPO_URL} "${GAIE_CLONE_DIR}"
$RUN_PREFIX cd "${GAIE_CLONE_DIR}"
$RUN_PREFIX git checkout ${GAIE_VERSION}
$RUN_PREFIX cd "${BUILD_CONTEXT}"
# Build EPP image
echo ""
echo "Building EPP image..."
export GAIE_DIR="${GAIE_CLONE_DIR}"
export DYNAMO_DIR="${BUILD_CONTEXT}"
# EPP directory with the new self-contained build
EPP_DIR="${BUILD_CONTEXT}/deploy/inference-gateway/epp"
# Set DOCKER_PROXY from ECR_HOSTNAME if available (for pulling base images through proxy)
# This prevents rate-limiting when building in CI across multiple PRs
DOCKER_PROXY_ARG=""
if [[ -n "${ECR_HOSTNAME}" ]]; then
export DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
DOCKER_PROXY="${ECR_HOSTNAME}/dockerhub/"
DOCKER_PROXY_ARG="DOCKER_PROXY=${DOCKER_PROXY}"
echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
fi
$RUN_PREFIX bash ${DYNAMO_DIR}/deploy/inference-gateway/build-epp-dynamo.sh
# Set EPP image tag (matches what build-epp-dynamo.sh produces)
EPP_IMAGE_TAG="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:${GAIE_VERSION}-dirty"
# Build EPP image using the Makefile
# The Makefile handles: building Dynamo library, building Docker image, loading it locally
$RUN_PREFIX make -C "${EPP_DIR}" all DYNAMO_DIR="${BUILD_CONTEXT}" ${DOCKER_PROXY_ARG}
# Compute EPP image tag (must match Makefile's IMAGE_TAG)
# IMAGE_TAG = $(IMAGE_REPO):$(GIT_TAG)
# IMAGE_REPO = $(DOCKER_SERVER)/$(IMAGE_NAME)
# Image lives in local cache only, not pushed to any registry
EPP_DOCKER_SERVER="dynamo"
EPP_IMAGE_NAME="dynamo-epp"
EPP_GIT_TAG=$(git describe --tags --dirty --always 2>/dev/null || echo "dev")
EPP_IMAGE_TAG="${EPP_DOCKER_SERVER}/${EPP_IMAGE_NAME}:${EPP_GIT_TAG}"
echo "Successfully built EPP image: ${EPP_IMAGE_TAG}"
......
## Inference Gateway Setup with Dynamo
When integrating Dynamo with the Inference Gateway you could either use the default EPP image provided by the extension or use the custom Dynamo image.
When integrating Dynamo with the Inference Gateway it is recommended to use the custom Dynamo EPP image.
1. When using the Dynamo custom EPP image you will take advantage of the Dynamo router when EPP chooses the best worker to route the request to. This setup uses a custom Dynamo plugin `dyn-kv` to pick the best worker. In this case the Dynamo routing logic is moved upstream. We recommend this approach.
1. **Dynamo EPP (Recommended):** The custom Dynamo EPP image integrates the Dynamo router directly into the gateway's endpoint picker. Using the `dyn-kv` plugin, it selects the optimal worker based on KV cache state and tokenized prompt before routing the request. The integration moves intelligent routing upstream to the gateway layer.
2. When using the GAIE-provided image for the EPP, the Dynamo deployment is treated as a black box and the EPP would route round-robin. In this case GAIE just fans out the traffic, and the smarts only remain within the Dynamo graph. Use this if you have one Dynamo graph and do not want to obtain the Dynamo EPP image. This is a "backup" approach.
2. **Standard EPP (Fallback):** You can use the default GAIE EPP image, which treats the Dynamo deployment as a black box and routes requests round-robin. Routing intelligence remains within the Dynamo graph itself. Use this approach if you have a single Dynamo graph and don't need the custom EPP image.
EPP’s default kv-routing approach is not token-aware because the prompt is not tokenized. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
The setup provided here uses the Dynamo custom EPP by default. Set `epp.useDynamo=false` in your deployment to use approach 2 (Standard EPP) instead.
EPP’s default kv-routing approach is not token-aware because the prompt is hashed without tokenization. But the Dynamo plugin uses a token-aware KV algorithm. It employs the dynamo router which implements kv routing by running your model’s tokenizer inline. The EPP plugin configuration lives in [`helm/dynamo-gaie/epp-config-dynamo.yaml`](helm/dynamo-gaie/epp-config-dynamo.yaml) per EPP [convention](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/config-text/).
Dynamo Integration with the Inference Gateway supports Aggregated and Disaggregated Serving.
If you want to use LoRA, deploy Dynamo without the Inference Gateway, or use the black-box approach with the Inference Gateway.
Currently, these setups are only supported with the kGateway based Inference Gateway.
......@@ -16,7 +19,19 @@ Currently, these setups are only supported with the kGateway based Inference Gat
- [Prerequisites](#prerequisites)
- [Installation Steps](#installation-steps)
- [Usage](#6-usage)
- [1. Install Dynamo Platform](#1-install-dynamo-platform)
- [2. Deploy Inference Gateway](#2-deploy-inference-gateway)
- [3. Deploy Your Model](#3-deploy-your-model)
- [4. Build EPP image](#4-build-epp-image)
- [5. Install Dynamo GAIE helm chart](#5-install-dynamo-gaie-helm-chart)
- [6. Verify Installation](#6-verify-installation)
- [7. Usage](#7-usage)
- [8. Deleting the installation](#8-deleting-the-installation)
- [Gateway API Inference Extension Details](#gateway-api-inference-extension-integration)
- [v1.2.1 API Changes](#v121-api-changes)
- [Building for v1.2.1](#building-for-v121)
- [Header-Only Routing for v1.2.1](#header-only-routing-for-v121)
## Prerequisites
......@@ -34,19 +49,22 @@ Currently, these setups are only supported with the kGateway based Inference Gat
First, deploy an inference gateway service. In this example, we'll install `kgateway` based gateway implementation.
```bash
./install_gaie_crd_kgateway.sh
cd deploy/inference-gateway
./scripts/install_gaie_crd_kgateway.sh
```
**Note**: The manifest at `config/manifests/gateway/kgateway/gateway.yaml` uses `gatewayClassName: agentgateway`, but kGateway's helm chart creates a GatewayClass named `kgateway`. The patch command in the script fixes this mismatch.
Verify installation:
#### f. Verify the Gateway is running
```bash
kubectl get gateway inference-gateway -n my-model
kubectl get gateway inference-gateway
# Sample output
# NAME CLASS ADDRESS PROGRAMMED AGE
# inference-gateway kgateway x.x.x.x True 1m
# inference-gateway kgateway True 1m
```
### 3. Deploy Your Model ###
Follow the steps in [model deployment](../../examples/backends/vllm/deploy/README.md) to deploy `Qwen/Qwen3-0.6B` model in aggregate mode using [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml) in `my-model` kubernetes namespace.
......@@ -54,7 +72,8 @@ Follow the steps in [model deployment](../../examples/backends/vllm/deploy/READM
Sample commands to deploy model:
```bash
cd <dynamo-source-root>/examples/backends/vllm/deploy
cd <dynamo-source-root>
cd examples/backends/vllm/deploy
kubectl apply -f agg.yaml -n my-model
```
......@@ -83,14 +102,42 @@ Create a model configuration file similar to the vllm_agg_qwen.yaml for your mod
This file demonstrates the values needed for the Vllm Agg setup in [agg.yaml](../../examples/backends/vllm/deploy/agg.yaml)
Take a note of the model's block size provided in the model card.
### 4. Install Dynamo GAIE helm chart ###
### 4. Build EPP image
You can either use the provided Dynamo FrontEnd image as the EPP image, or build your own custom Dynamo EPP image by following the steps below.
```bash
# export env vars
export DOCKER_SERVER=ghcr.io/nvidia/dynamo # Container registry
export IMAGE_TAG=${DOCKER_SERVER}/dynamo-epp:YOUR-TAG # Full image reference; defaults to ${DOCKER_SERVER}/dynamo-epp:<git describe output>
cd deploy/inference-gateway/epp
make all # Do everything in one command
# or make all-push to also push
# Or step-by-step
make dynamo-lib # Build Dynamo library and copy to project
make image-load # Build Docker image and load locally
make image-push # Build and push to registry
make info # Check image tag
```
#### All-in-one Targets
| Target | Description |
|--------|-------------|
| `make dynamo-lib` | Build Dynamo static library and copy to project |
| `make all` | Build Dynamo lib + Docker image + load locally |
| `make all-push` | Build Dynamo lib + Docker image + push to registry |
### 5. Install Dynamo GAIE helm chart ###
The Inference Gateway is configured through the `inference-gateway-resources.yaml` file.
Deploy the Inference Gateway resources to your Kubernetes cluster by running the command below.
```bash
cd deploy/inference-gateway
cd deploy/inference-gateway/
# Export the Dynamo image you have used when deploying your model in Step 3.
export DYNAMO_IMAGE=<the-dynamo-image-you-have-used-when-deploying-the-model>
......@@ -122,7 +169,7 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
- Overwrite the `DYN_NAMESPACE` env var if needed to match your model's dynamo namespace.
- Set `DYNAMO_BUSY_THRESHOLD` to configure the upper bound on how “full” a worker can be (often derived from kv_active_blocks or other load metrics) before the router skips it. If the selected worker exceeds this value, routing falls back to the next best candidate. By default the value is negative meaning this is not enabled.
- Set `DYNAMO_ROUTER_REPLICA_SYNC=true` to enable a background watcher to keep multiple router instances in sync (important if you run more than one KV router per component).
- Set `DYNAMO_ENFORCE_DISAGG=true` if you want to enforce every request being served in the disaggregated manner. By default it is false, meaning that if the prefill worker is not available the request will be served in the aggregated manner.
- By default the Dynamo plugin uses KV routing. You can set `DYNAMO_USE_KV_ROUTING=false` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) if you prefer to route in a round-robin fashion.
- If using kv-routing:
- Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-dynamo-epp.yaml](./values-dynamo-epp.yaml) to match your model's block size. The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
......@@ -132,52 +179,25 @@ You can configure the plugin by setting environment vars in your [values-dynamo-
- See the [KV cache routing design](../../docs/router/kv_cache_routing.md) for details.
Dynamo provides a custom routing plugin `pkg/epp/scheduling/plugins/dynamo_kv_scorer/plugin.go` to perform efficient kv routing.
The Dynamo router is built as a static library, which the EPP router calls to provide fast inference.
You can either use the special FrontEnd image for the EPP_IMAGE in the Helm deployment command and proceed to the step 2 or you can build the image yourself following the steps below.
##### 1. Build the custom EPP image #####
If you choose to build your own image, use the `container/build.sh` script with the `--target frontend` option:
```bash
./container/build.sh --framework none --target frontend
```
This command automatically:
- Clones the Gateway API Inference Extension (GAIE) repository at the correct version
- Builds the Dynamo Router static library
- Applies the necessary patches to the EPP codebase
- Builds the custom EPP image with Dynamo KV routing support
- Builds the frontend image with the EPP binary and Dynamo runtime components
Re-tag the freshly built image and push it to your registry:
```bash
docker images
docker tag <your-new-id> <your-image-tag>
docker push <your-image-tag>
```
**Note**
You can also use the standard EPP image`us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.4.0`. For the basic black box integration run:
You can also use the standard EPP image i.e. `us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v1.2.1` for the basic black box integration.
```bash
cd deploy/inference-gateway
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml
# Optionally export the standard EPP image if you do not want to use the default we suggest.
export EPP_IMAGE=us-central1-docker.pkg.dev/k8s-artifacts-prod/images/gateway-api-inference-extension/epp:v0.4.0
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set epp.useDynamo=false --set-string extension.image=$EPP_IMAGE
# Optionally overwrite the image --set-string extension.image=$EPP_IMAGE
```
### 5. Verify Installation ###
### 6. Verify Installation ###
Check that all resources are properly deployed:
```bash
kubectl get inferencepool
kubectl get inferencemodel
kubectl get httproute
kubectl get service
kubectl get gateway
......@@ -190,16 +210,12 @@ Sample output:
NAME AGE
qwen-pool 33m
# kubectl get inferencemodel
NAME MODEL NAME INFERENCE POOL CRITICALITY AGE
qwen-model Qwen/Qwen3-0.6B qwen-pool Critical 33m
# kubectl get httproute
NAME HOSTNAMES AGE
qwen-route 33m
```
### 6. Usage ###
### 7. Usage ###
The Inference Gateway provides HTTP endpoints for model inference.
......@@ -310,11 +326,56 @@ Sample inference output:
}
```
### 7. Deleting the installation ###
### 8. Deleting the installation ###
If you need to uninstall run:
```bash
kubectl delete dynamoGraphDeployment vllm-agg
helm uninstall dynamo-gaie -n my-model
# To uninstall GAIE
# 1. Delete the inference-gateway
kubectl delete gateway inference-gateway --ignore-not-found
# 2. Uninstall kgateway helm releases
helm uninstall kgateway -n kgateway-system
helm uninstall kgateway-crds -n kgateway-system
# 3. Delete the kgateway-system namespace (optional, cleans up everything in it)
kubectl delete namespace kgateway-system --ignore-not-found
# 4. Delete the Inference Extension CRDs
IGW_LATEST_RELEASE=v1.2.1
kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${IGW_LATEST_RELEASE}/manifests.yaml --ignore-not-found
# 5. Delete the Gateway API CRDs
GATEWAY_API_VERSION=v1.4.1
kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml --ignore-not-found
```
## Gateway API Inference Extension Integration
This section documents the updated plugin implementation for Gateway API Inference Extension **v1.2.1**.
### v1.2.1 API Changes
### Building for v1.2.1
The plugin code for v1.2.1 is in:
- `pkg/plugins/dynamo_kv_scorer/plugin.go`
### Header-Only Routing for v1.2.1
In v1.2.1, the EPP uses a **header-only approach** for communicating routing decisions.
The plugins set HTTP headers that are forwarded to the backend workers.
#### Headers Set by Dynamo Plugins
| Header | Description | Set By |
|--------|-------------|--------|
| `x-worker-instance-id` | Primary worker ID (decode worker in disagg mode) | kv-aware-scorer |
| `x-prefill-instance-id` | Prefill worker ID (disaggregated mode only) | kv-aware-scorer |
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e # Exit on any error

# Builds the Dynamo static library and C header, copies them (plus the EPP
# Dockerfile) into a GAIE checkout, applies the Dynamo patch when it applies
# cleanly, and finally builds the EPP image via the GAIE Makefile.
#
# Required environment:
#   DYNAMO_DIR    path to the Dynamo source tree
#   GAIE_DIR      path to the gateway-api-inference-extension checkout
# Optional environment:
#   DOCKER_PROXY  forwarded to `make` as DOCKER_PROXY=... when non-empty

# Configuration - Set these environment variables before running
if [[ -z "${DYNAMO_DIR}" ]]; then
    echo "DYNAMO_DIR environment variable must be set"
    echo " Example: export DYNAMO_DIR=/path/to/dynamo"
    exit 1
fi
if [[ -z "${GAIE_DIR}" ]]; then
    echo "GAIE_DIR environment variable must be set"
    echo " Example: export GAIE_DIR=/path/to/gateway-api-inference-extension"
    exit 1
fi

DYNAMO_LIB_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib"
DYNAMO_INCLUDE_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/include"
DOCKERFILE_DEST="${GAIE_DIR}/Dockerfile.dynamo"

echo "Building Dynamo KV Router C Library..."

# Step 1: Build the static library
echo "Building static library..."
cd "${DYNAMO_DIR}"
cargo build --release -p libdynamo_llm

# Step 2: Generate header file (with fallback)
echo "Generating C header..."
HEADER_OUTPUT="${DYNAMO_DIR}/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h"
# The output directory may not exist on a clean checkout; without it both
# cbindgen and the fallback copy below would fail.
mkdir -p "$(dirname "${HEADER_OUTPUT}")"
if ! cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm --output "${HEADER_OUTPUT}"; then
    echo "cbindgen failed, using fallback header..."
    cp "${DYNAMO_DIR}/lib/bindings/c/src/fallback_header.h" "${HEADER_OUTPUT}"
fi

# Step 3: Ensure directories exist
echo "Preparing directories..."
mkdir -p "${DYNAMO_LIB_DIR}"
mkdir -p "${DYNAMO_INCLUDE_DIR}"

# Step 4: Copy files to GAIE project
echo "Copying files to the GAIE project..."
cp "${HEADER_OUTPUT}" "${DYNAMO_INCLUDE_DIR}/"
cp "${DYNAMO_DIR}/target/release/libdynamo_llm_capi.a" "${DYNAMO_LIB_DIR}/"
cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${DOCKERFILE_DEST}"

# Verify files were copied
if [[ ! -f "${DYNAMO_INCLUDE_DIR}/llm_engine.h" ]]; then
    echo "Header file copy failed!"
    exit 1
fi
if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
    echo "Library file copy failed!"
    exit 1
fi
if [[ ! -f "${DOCKERFILE_DEST}" ]]; then
    echo "Dockerfile.dynamo file copy failed!"
    exit 1
fi

echo "Files copied successfully:"
echo " Header: ${DYNAMO_INCLUDE_DIR}/llm_engine.h"
echo " Library: ${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a"
# Report the path that was actually written (Dockerfile.dynamo, not Dockerfile.epp).
echo " Docker: ${DOCKERFILE_DEST}"

# Step 5: Apply Dynamo patch (if it exists)
echo "Applying Dynamo patch..."
cd "${GAIE_DIR}"
PATCH_FILE="${DYNAMO_DIR}/deploy/inference-gateway/epp-patches/v0.8.0/gaie.patch"
if [[ -f "${PATCH_FILE}" ]]; then
    # Best effort: a patch that does not apply cleanly is reported, not fatal.
    if git apply --check "${PATCH_FILE}" 2>/dev/null; then
        git apply "${PATCH_FILE}"
        echo "Patch applied successfully"
    else
        echo "Patch doesn't apply cleanly - may already be applied or need manual resolution"
    fi
else
    echo "No patch file found at ${PATCH_FILE}"
fi

# Step 6: Build the EPP image
echo "Building the custom EPP image for GAIE..."
# Build make args - pass DOCKER_PROXY if set (e.g., from ECR_HOSTNAME).
# An array keeps each argument intact instead of relying on word splitting.
MAKE_ARGS=()
if [[ -n "${DOCKER_PROXY}" ]]; then
    echo "Using DOCKER_PROXY: ${DOCKER_PROXY}"
    MAKE_ARGS+=("DOCKER_PROXY=${DOCKER_PROXY}")
fi
make "${MAKE_ARGS[@]}" dynamo-image-local-load
echo "EPP image with Dynamo KV routing built"
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright The Kubernetes Authors.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modifications Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES
# Dynamo EPP Dockerfile
# Builds a custom EPP image with Dynamo KV-aware routing plugins
#
# PREREQUISITES: Run `make dynamo-lib` before building this image to ensure
# the Dynamo FFI library and headers are in place.
# Global build arguments: overridable image references for the two stages.
ARG BUILDER_IMAGE=golang:1.24-bookworm
ARG BASE_IMAGE=ubuntu:24.04
# =============================================================================
# Build stage
# =============================================================================
FROM ${BUILDER_IMAGE} AS builder
WORKDIR /workspace
# Install build dependencies for CGO
RUN apt-get update && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        libc-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy go mod files first for better caching
COPY go.mod go.sum ./
RUN go mod download
# Copy the source code (including pre-built Dynamo library)
COPY . .
# Verify Dynamo library exists; fail early with an actionable message.
RUN if [ ! -f "pkg/plugins/dynamo_kv_scorer/lib/libdynamo_llm_capi.a" ]; then \
        echo "ERROR: Dynamo library not found!"; \
        echo "Run 'make dynamo-lib' before building the Docker image."; \
        exit 1; \
    fi
# Docker buildx provides TARGETOS/TARGETARCH automatically for multi-platform
# builds. These ARGs and the build metadata below are declared only here,
# right before the step that uses them, because every RUN after an ARG sees
# its value: declaring them at the top of the stage would invalidate the apt
# and `go mod download` layers whenever COMMIT_SHA or BUILD_REF changes.
ARG TARGETOS=linux
ARG TARGETARCH
ARG COMMIT_SHA
ARG BUILD_REF
# Build with CGO enabled for the Dynamo FFI
# Use TARGETOS/TARGETARCH from Docker buildx for proper platform support
RUN CGO_ENABLED=1 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build \
    -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.GitVersion=${BUILD_REF} \
    -X sigs.k8s.io/gateway-api-inference-extension/version.GitCommit=${COMMIT_SHA}" \
    -o epp ./cmd/epp
# =============================================================================
# Runtime stage
# =============================================================================
FROM ${BASE_IMAGE}
# Install runtime dependencies. --no-install-recommends keeps the image to the
# two packages actually required (CA bundle and the C++ runtime).
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        libstdc++6 \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /
# Copy the binary from builder
COPY --from=builder /workspace/epp .
# Note: EPP config is mounted via Kubernetes ConfigMap at runtime
# See helm/dynamo-gaie/templates/epp-configmap.yaml
# Create non-root user with a fixed UID in the existing "nogroup" group.
# NOTE(review): no home directory is created for this account (no -m flag);
# confirm the EPP does not need a writable $HOME at runtime.
RUN useradd -r -u 65532 -g nogroup nonroot
# Numeric UID:GID so the runtime can verify the user is non-root.
# NOTE(review): 65534 is assumed to be the GID of "nogroup" in BASE_IMAGE - confirm.
USER 65532:65534
ENTRYPOINT ["/epp"]
# Dynamo EPP Makefile
# Builds custom EPP image with Dynamo KV-aware routing plugins
# ---------------------------------------------------------------------------
# Image naming
# ---------------------------------------------------------------------------
# By default the image lives in the local cache only, not pushed to any registry.
DOCKER_SERVER ?= dynamo
IMAGE_NAME := dynamo-epp
# Both git lookups fall back to a placeholder when git metadata is unavailable.
GIT_COMMIT_SHA ?= $(shell git rev-parse HEAD 2>/dev/null || echo "unknown")
GIT_TAG ?= $(shell git describe --tags --dirty --always 2>/dev/null || echo "dev")
# IMAGE_TAG is the full image reference (repository:tag).
IMAGE_REPO ?= $(DOCKER_SERVER)/$(IMAGE_NAME)
IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)

# ---------------------------------------------------------------------------
# Build configuration
# ---------------------------------------------------------------------------
# The Dynamo library is built for the host architecture, so the Docker platform
# must match it: aarch64/arm64 hosts build linux/arm64, everything else
# (including x86_64) builds linux/amd64.
HOST_ARCH := $(shell uname -m)
ifneq ($(filter aarch64 arm64,$(HOST_ARCH)),)
PLATFORMS ?= linux/arm64
else
PLATFORMS ?= linux/amd64
endif

# Docker proxy for avoiding rate limits (e.g., ECR mirror).
# Set DOCKER_PROXY to prefix base images, e.g., DOCKER_PROXY=my-registry.com/dockerhub/
DOCKER_PROXY ?=
DOCKER_BUILDX_CMD ?= docker buildx
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
BUILDER_IMAGE ?= $(DOCKER_PROXY)golang:1.24
BASE_IMAGE ?= $(DOCKER_PROXY)ubuntu:24.04

# Container tool
CONTAINER_TOOL ?= docker

# Kind cluster name for local testing
KIND_CLUSTER ?= kind

# ---------------------------------------------------------------------------
# Directories
# ---------------------------------------------------------------------------
# Project directory (the directory make is running in).
PROJECT_DIR := $(shell pwd)
# Dynamo source root; by default three levels up, assuming this Makefile lives
# in dynamo/deploy/inference-gateway/epp.
DYNAMO_DIR ?= $(shell cd $(PROJECT_DIR)/../../.. && pwd)
# Destinations for the static library and C header that `dynamo-lib` copies in.
DYNAMO_LIB_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/lib
DYNAMO_INCLUDE_DIR := $(PROJECT_DIR)/pkg/plugins/dynamo_kv_scorer/include
# Self-documenting help target. The awk program scans $(MAKEFILE_LIST) and
# prints every "target: ## description" line as a usage entry and every
# "##@ Section" line as a bold heading, so new targets only need a trailing
# "## ..." comment to show up here.
.PHONY: help
help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
##@ Development
# Thin wrappers around the Go toolchain, each applied to every package (./...).
.PHONY: fmt
fmt: ## Run go fmt
go fmt ./...
.PHONY: vet
vet: ## Run go vet
go vet ./...
.PHONY: tidy
tidy: ## Run go mod tidy
go mod tidy
# Tests run with CGO_ENABLED=1 and verbose output.
.PHONY: test
test: ## Run tests
CGO_ENABLED=1 go test ./... -v
##@ Build
# `build` only verifies that the Dynamo library and header are present
# (dynamo-lib-check); it does not build them. Output binary: bin/epp.
.PHONY: build
build: dynamo-lib-check ## Build the EPP binary locally (requires CGO and Dynamo libraries)
CGO_ENABLED=1 go build -o bin/epp ./cmd/epp
# `build-with-lib` lists dynamo-lib before build so the library is produced first.
.PHONY: build-with-lib
build-with-lib: dynamo-lib build ## Build Dynamo library and EPP binary
# Builds the image with $(IMAGE_BUILD_CMD) using the current directory as the
# build context. $(PUSH) and $(LOAD) are empty here; image-push and image-load
# set them via target-specific variables to add --push or --load.
.PHONY: image-build
image-build: dynamo-lib-check ## Build the Docker image using buildx
$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
--platform=$(PLATFORMS) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
--build-arg COMMIT_SHA=$(GIT_COMMIT_SHA) \
--build-arg BUILD_REF=$(GIT_TAG) \
$(PUSH) \
$(LOAD) \
.
# Same build as image-build, with --push appended.
.PHONY: image-push
image-push: PUSH=--push ## Build and push the Docker image
image-push: image-build
# Same build as image-build, with --load appended.
.PHONY: image-load
image-load: LOAD=--load ## Build and load the Docker image locally
image-load: image-build
# Runs image-load first, then hands the tagged image to the kind cluster
# named by $(KIND_CLUSTER).
.PHONY: image-kind
image-kind: image-load ## Build and load the image into kind cluster
kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
##@ Local Development with Buildx
# Creates a throwaway buildx builder, runs `image-build` in a sub-make with the
# current PUSH/LOAD flags, then removes the builder. The builder is removed even
# when the image build fails, so failed builds do not leak builders; the recipe
# still exits non-zero if either the build or the removal fails.
.PHONY: image-local-build
image-local-build: ## Build image using a new buildx builder
	BUILDER=$$($(DOCKER_BUILDX_CMD) create --use) && { \
		status=0; \
		$(MAKE) image-build PUSH=$(PUSH) LOAD=$(LOAD) || status=$$?; \
		$(DOCKER_BUILDX_CMD) rm "$$BUILDER" || status=$$?; \
		exit $$status; \
	}
# Same as image-local-build, with --push passed through to image-build.
.PHONY: image-local-push
image-local-push: PUSH=--push ## Build and push using local buildx builder
image-local-push: image-local-build
# Same as image-local-build, with --load passed through to image-build.
.PHONY: image-local-load
image-local-load: LOAD=--load ## Build and load using local buildx builder
image-local-load: image-local-build
##@ Dynamo Library Build
# Produces the two artifacts the EPP build needs and copies them into this
# project:
#   1. builds the libdynamo_llm crate in release mode inside $(DYNAMO_DIR);
#   2. generates llm_engine.h with cbindgen, falling back to the checked-in
#      fallback_header.h when cbindgen fails;
#   3. copies the header to $(DYNAMO_INCLUDE_DIR) and libdynamo_llm_capi.a to
#      $(DYNAMO_LIB_DIR), creating both directories first.
.PHONY: dynamo-lib
dynamo-lib: ## Build Dynamo static library and copy to project
@echo "Building Dynamo static library..."
cd "$(DYNAMO_DIR)" && cargo build --release -p libdynamo_llm
@echo "Generating C header..."
@mkdir -p "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm"
cd "$(DYNAMO_DIR)" && \
(cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm \
--output lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h || \
cp lib/bindings/c/src/fallback_header.h lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h)
@echo "Copying files to EPP project..."
@mkdir -p "$(DYNAMO_LIB_DIR)"
@mkdir -p "$(DYNAMO_INCLUDE_DIR)"
cp "$(DYNAMO_DIR)/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h" "$(DYNAMO_INCLUDE_DIR)/"
cp "$(DYNAMO_DIR)/target/release/libdynamo_llm_capi.a" "$(DYNAMO_LIB_DIR)/"
@echo "Dynamo library ready!"
# Guard target: fails with a pointer to `make dynamo-lib` unless both the static
# library and the generated header are already in place. It never builds them.
.PHONY: dynamo-lib-check
dynamo-lib-check: ## Check if Dynamo library files exist
	@test -f "$(DYNAMO_LIB_DIR)/libdynamo_llm_capi.a" || { \
		echo "ERROR: Dynamo library not found. Run 'make dynamo-lib' first."; \
		exit 1; \
	}
	@test -f "$(DYNAMO_INCLUDE_DIR)/llm_engine.h" || { \
		echo "ERROR: Dynamo header not found. Run 'make dynamo-lib' first."; \
		exit 1; \
	}
	@echo "Dynamo library files found."
##@ Clean
# Removes the local bin/ output directory and runs `go clean`. The Dynamo
# library and header copied in by `dynamo-lib` are not removed by this target.
.PHONY: clean
clean: ## Clean build artifacts
rm -rf bin/
go clean
##@ All-in-one Build
# The aggregate targets below rely on prerequisite order: `dynamo-lib` must
# finish before the image targets run, because the image build only checks for
# the library (dynamo-lib-check) and does not depend on dynamo-lib itself.
# Under `make -j` that left-to-right order is not guaranteed, so force this
# Makefile to run serially.
.NOTPARALLEL:
.PHONY: all
all: dynamo-lib image-local-load ## Build Dynamo lib and Docker image, load locally
.PHONY: all-push
all-push: dynamo-lib image-push ## Build Dynamo lib and Docker image, push to registry
.PHONY: all-kind
all-kind: dynamo-lib image-kind ## Build Dynamo lib and Docker image, load to kind
##@ Info
# Prints the resolved values of the main configuration variables; useful for
# checking the image reference and directories a build would use.
.PHONY: info
info: ## Show build info
@echo "Image Tag: $(IMAGE_TAG)"
@echo "Git Commit: $(GIT_COMMIT_SHA)"
@echo "Git Tag: $(GIT_TAG)"
@echo "Platforms: $(PLATFORMS)"
@echo "Docker Proxy: $(DOCKER_PROXY)"
@echo "Builder Image: $(BUILDER_IMAGE)"
@echo "Base Image: $(BASE_IMAGE)"
@echo "Dynamo Dir: $(DYNAMO_DIR)"
@echo "Dynamo Lib Dir: $(DYNAMO_LIB_DIR)"
@echo "Dynamo Include Dir: $(DYNAMO_INCLUDE_DIR)"
/*
Copyright 2025 NVIDIA Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Dynamo EPP - Custom Endpoint Picker Plugin for NVIDIA Dynamo
//
// This EPP integrates with the Gateway API Inference Extension to provide
// KV-aware routing for Dynamo inference backends.
//
// # Header-Based Routing
//
// The Dynamo KV scorer sets routing headers that the Lua filter at the
// gateway uses to inject nvext into the request body:
//
// - x-worker-instance-id: Selected worker ID (decode worker in disagg mode)
// - x-prefiller-host-port: Prefill worker ID (disaggregated mode only)
// - x-dynamo-routing-mode: "aggregated" or "disaggregated"
//
// The Lua filter reads these headers and injects:
// - Aggregated: {"nvext": {"backend_instance_id": <worker_id>}}
// - Disaggregated: {"nvext": {"prefill_worker_id": <prefill>, "decode_worker_id": <decode>}}
package main
import (
"os"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
// Dynamo plugins
dynscorer "github.com/nvidia/dynamo/deploy/inference-gateway/pkg/plugins/dynamo_kv_scorer"
)
// main registers the Dynamo KV-aware scorer under the plugin type
// "kv-aware-scorer" and then hands control to the standard GAIE runner.
// The process exits with status 1 if the runner returns an error.
func main() {
	// Dynamo custom plugin. A single "kv-aware-scorer" plugin implements the
	// Scorer, PreRequest, and ResponseComplete interfaces:
	//   - Score: calls the Dynamo router to select workers based on KV cache
	//     and sets the routing headers.
	//   - PreRequest: registers the request with router bookkeeping once
	//     scheduling is finalized.
	//   - ResponseComplete: cleans up router bookkeeping when the response
	//     completes.
	plugins.Register("kv-aware-scorer", dynscorer.KVAwareScorerFactory)

	// The standard GAIE runner registers the built-in plugins automatically.
	eppRunner := runner.NewRunner()
	// Context supplied by controller-runtime's signal handler.
	stopCtx := ctrl.SetupSignalHandler()

	runErr := eppRunner.Run(stopCtx)
	if runErr == nil {
		return
	}
	os.Exit(1)
}
// Go module for the Dynamo inference-gateway EPP build.
module github.com/nvidia/dynamo/deploy/inference-gateway
// Go language version declared for this module.
go 1.24.0
// Direct dependencies: both module paths are imported by the EPP main package.
require (
sigs.k8s.io/controller-runtime v0.22.4
sigs.k8s.io/gateway-api-inference-extension v1.2.1
)
require (
cel.dev/expr v0.24.0 // indirect
github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/dennwc/varint v1.0.0 // indirect
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect
github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.2 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/cel-go v0.26.0 // indirect
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.4 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/prometheus/prometheus v0.308.1 // indirect
github.com/spf13/cobra v1.9.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect
github.com/stoewer/go-strcase v1.3.0 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
go.opentelemetry.io/otel v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.39.0 // indirect
go.opentelemetry.io/otel/metric v1.39.0 // indirect
go.opentelemetry.io/otel/sdk v1.39.0 // indirect
go.opentelemetry.io/otel/trace v1.39.0 // indirect
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.1 // indirect
go.yaml.in/yaml/v2 v2.4.3 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/exp v0.0.0-20250808145144-a408d31f581a // indirect
golang.org/x/net v0.47.0 // indirect
golang.org/x/oauth2 v0.32.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.39.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.13.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect
google.golang.org/grpc v1.78.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/api v0.34.3 // indirect
k8s.io/apiextensions-apiserver v0.34.3 // indirect
k8s.io/apimachinery v0.34.3 // indirect
k8s.io/apiserver v0.34.3 // indirect
k8s.io/client-go v0.34.3 // indirect
k8s.io/component-base v0.34.3 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d // indirect
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
sigs.k8s.io/randfill v1.0.0 // indirect
sigs.k8s.io/structured-merge-diff/v6 v6.3.1 // indirect
sigs.k8s.io/yaml v1.6.0 // indirect
)
// NOTE: For local development, uncomment the replace directive below.
// For Docker builds, keep it commented out to use the published v1.2.1 release.
// replace sigs.k8s.io/gateway-api-inference-extension => ../../../gaie_latest/gateway-api-inference-extension
This diff is collapsed.
......@@ -13,6 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Dynamo EPP Configuration
#
# The KV scorer sets routing headers that the Lua filter at the gateway
# reads to inject nvext into the request body:
# - x-worker-instance-id: Selected worker ID
# - x-prefiller-host-port: Prefill worker (disaggregated mode)
# - x-dynamo-routing-mode: "aggregated" or "disaggregated"
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
......@@ -22,14 +30,15 @@ plugins:
# Picker: chooses the final endpoint after scoring
- name: picker
type: max-score-picker
- name: dyn-pre
type: dynamo-inject-workerid
parameters: {}
# Dynamo KV-aware Scorer: calls Dynamo router FFI for worker selection
# Implements Scorer, PreRequest, and ResponseComplete:
# - Score: Selects workers based on KV cache, sets routing headers
# - PreRequest: Registers request with router bookkeeping
# - ResponseComplete: Frees router bookkeeping when response completes
- name: dyn-kv
type: kv-aware-scorer
parameters:
frontendURL: http://127.0.0.1:8000/v1/chat/completions
timeoutMS: 10000
schedulingProfiles:
- name: default
plugins:
......
......@@ -22,4 +22,5 @@ subjects:
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: pod-read
\ No newline at end of file
name: pod-read
apiGroup: rbac.authorization.k8s.io
\ No newline at end of file
......@@ -19,10 +19,10 @@ metadata:
rules:
# Gateway API inference resources
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencepools"]
resources: ["inferencepools", "inferenceobjectives", "inferencemodelrewrites"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencemodels"]
- apiGroups: ["inference.networking.k8s.io"]
resources: ["inferencepools"]
verbs: ["get", "watch", "list"]
# Core resources for pod discovery
- apiGroups: [""]
......
......@@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- /* ------------ file-scope vars (no output) ------------ */ -}}
{{- $platformNs := default .Release.Namespace .Values.platformNamespace -}}
{{- $platformName := default "dynamo-platform" .Values.platformReleaseName -}}
......@@ -23,10 +24,10 @@
{{- $std := .Values.extension.standardImage -}}
{{- $dyn := .Values.extension.dynamoImage -}}
{{- $fallback := ternary $dyn $std .Values.epp.useDynamo -}}
{{- $eppImage := default $fallback .Values.extension.image -}}
{{- $eppImage := default $fallback .Values.extension.image }}
--- # <-- start of actual YAML document
---
# Deployment for the EPP (Endpoint Picker Plugin)
apiVersion: apps/v1
kind: Deployment
metadata:
......@@ -61,26 +62,30 @@ spec:
{{- if .Values.epp.argsOverride }}
{{- toYaml .Values.epp.argsOverride | nindent 8 }}
{{- else }}
- -poolName
- -pool-name
- "{{ .Values.model.shortName }}-pool"
- -poolNamespace
- -pool-namespace
- "{{ .Release.Namespace }}"
- -pool-group
- "inference.networking.x-k8s.io"
- -v
- "4"
- --zap-encoder
- "json"
- -grpcPort
- -grpc-port
- "9002"
- -grpcHealthPort
- -grpc-health-port
- "9003"
{{- if $useDynamo }}
- -configFile
- -config-file
- "{{ .Values.epp.configFile }}"
{{- end }}
{{- end }}
{{- if $useDynamo }}
volumeMounts:
- name: hf-cache
mountPath: /home/nonroot/.cache
{{- if $useDynamo }}
- name: epp-config
mountPath: /etc/epp
readOnly: true
......@@ -117,11 +122,21 @@ spec:
value: "true"
- name: USE_STREAMING
value: "true"
# HuggingFace token for downloading model config files
# Without this, HuggingFace rate-limits requests (429 Too Many Requests)
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
optional: true
{{- end }}
{{- range .Values.epp.extraEnv }}
- name: {{ .name }}
value: {{ .value | quote }}
{{- end }}
- name: RUST_LOG
value: "debug,dynamo_llm::kv_router=trace"
ports:
- containerPort: 9002
......@@ -141,8 +156,10 @@ spec:
initialDelaySeconds: 5
periodSeconds: 10
{{- if $useDynamo }}
volumes:
- name: hf-cache
emptyDir: {}
{{- if $useDynamo }}
- name: epp-config
configMap:
name: {{ include "dynamo-gaie.fullname" . }}-epp-config
......
......@@ -14,6 +14,8 @@
# limitations under the License.
{{- if .Values.httpRoute.enabled }}
{{- /* Default gatewayNamespace to the release namespace if not specified */ -}}
{{- $gatewayNs := default .Release.Namespace .Values.httpRoute.gatewayNamespace }}
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
......@@ -24,9 +26,10 @@ spec:
- group: gateway.networking.k8s.io
kind: Gateway
name: {{ .Values.httpRoute.gatewayName }}
namespace: {{ $gatewayNs }}
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: {{ .Values.model.shortName }}-pool
namespace: {{ .Release.Namespace }}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# InferenceModel for the chart's model. The resource is named
# "<model.shortName>-model" and is created in the release namespace.
# NOTE(review): this uses the inference.networking.x-k8s.io/v1alpha2 API -
# confirm the matching CRD is still installed with the GAIE release in use.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
name: {{ .Values.model.shortName }}-model
namespace: {{ .Release.Namespace }}
spec:
# criticality and modelName are taken directly from the chart's model.* values.
criticality: {{ .Values.model.criticality }}
modelName: {{ .Values.model.identifier }}
# Points at the InferencePool named "<model.shortName>-pool"; no namespace is set on the reference.
poolRef:
group: inference.networking.x-k8s.io
kind: InferencePool
name: {{ .Values.model.shortName }}-pool
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment