Unverified Commit e64d2f09 authored by atchernych's avatar atchernych Committed by GitHub
Browse files

feat: Streamline GAIE recipe (#3829)


Signed-off-by: default avatarAnna Tchernych <atchernych@nvidia.com>
parent 3d7b4525
......@@ -53,7 +53,7 @@ b. Install the Inference Extension CRDs (Inference Model and Inference Pool CRDs
```bash
INFERENCE_EXTENSION_VERSION=v0.5.1
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml -n my-model
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml
```
c. Install `kgateway` CRDs and kgateway.
......
......@@ -13,14 +13,14 @@ This repository contains production-ready recipes for deploying large language m
## Available Models
| Model Family | Framework | Deployment Mode | GPU Requirements | Status | Benchmark |
|-----------------|-----------|---------------------|------------------|--------|-----------|
| llama-3-70b | vllm | agg | 4x H100/H200 | ✅ | ✅ |
| llama-3-70b | vllm | disagg (1 node) | 8x H100/H200 | ✅ | ✅ |
| llama-3-70b | vllm | disagg (multi-node) | 16x H100/H200 | ✅ | ✅ |
| deepseek-r1 | sglang | disagg (1 node, wide-ep) | 8x H200 | ✅ | 🚧 |
| deepseek-r1 | sglang | disagg (multi-node, wide-ep) | 16x H200 | ✅ | 🚧 |
| gpt-oss-120b | trtllm | agg | 4x GB200 | ✅ | ✅ |
| Model Family | Framework | Deployment Mode | GPU Requirements | Status | Benchmark |GAIE-integration |
|-----------------|-----------|---------------------|------------------|--------|-----------|------------------|
| llama-3-70b | vllm | agg | 4x H100/H200 | ✅ | ✅ |✅ |
| llama-3-70b | vllm | disagg (1 node) | 8x H100/H200 | ✅ | ✅ | 🚧 |
| llama-3-70b | vllm | disagg (multi-node) | 16x H100/H200 | ✅ | ✅ |🚧 |
| deepseek-r1 | sglang | disagg (1 node, wide-ep) | 8x H200 | ✅ | 🚧 |🚧 |
| deepseek-r1 | sglang | disagg (multi-node, wide-ep) | 16x H200 | ✅ | 🚧 |🚧 |
| gpt-oss-120b | trtllm | agg | 4x GB200 | ✅ | ✅ |🚧 |
**Legend:**
- ✅ Functional
......@@ -89,9 +89,7 @@ vim hf_hub_secret/hf_hub_secret.yaml
kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE}
```
### 6. Configure Storage Class
Configure persistent storage for model caching:
6. Configure Storage Class
```bash
# Check available storage classes
......@@ -160,6 +158,20 @@ kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE}
./run.sh --dry-run --model llama-3-70b --framework vllm --deployment agg
```
## Deploying with the Gateway API Inference Extension (GAIE)
1. Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE.
2. Apply manifests by running a script.
```bash
# Set this to the --block-size value passed to the engine in your deploy.yaml, e.g. "python3 -m dynamo.vllm ... --block-size 128"
export DYNAMO_KV_BLOCK_SIZE=128
export EPP_IMAGE=nvcr.io/you/epp:tag
# Add the --gaie argument to the script, e.g.:
./run.sh --model llama-3-70b --framework vllm --gaie agg
```
The script will perform gateway checks and apply the manifests.
## Option 2: Manual Deployment
......
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Pre-flight checks for the GAIE (Gateway API Inference Extension) recipe.
#
# Verifies, in order, that:
#   1. the Gateway API CRDs exist and are Established,
#   2. the GAIE (Inference Extension) CRDs exist and are Established,
#   3. the kgateway controller pods exist and become Ready,
#   4. the 'inference-gateway' Gateway exists in the target namespace.
# The target namespace is created when it does not exist yet.
#
# Environment (both optional):
#   NAMESPACE  namespace that holds the Gateway        (default: dynamo)
#   KGW_NS     namespace of the kgateway controller    (default: kgateway-system)
#
# Exits non-zero, with a message on stderr, at the first failed check.
set -Eeuo pipefail

# ===== Config (env overridable) =====
# Defaults are assigned before anything reads them: under `set -u`, expanding
# an unset NAMESPACE would abort the script.
: "${NAMESPACE:=dynamo}"
KGW_NS="${KGW_NS:-kgateway-system}"

# ===== Output helpers =====
ok()   { printf "✅ %s\n" "$*"; }
fail() { printf "❌ %s\n" "$*" >&2; exit 1; }
info() { printf "ℹ️ %s\n" "$*"; }
# Abort unless the command named in $1 is on PATH.
need() { command -v "$1" >/dev/null 2>&1 || fail "'$1' is required"; }

# Fail unless every listed CRD exists and reaches condition Established.
#   $1     hint appended to the "Missing CRD" message
#   $2...  CRD names
check_crds() {
  local hint=$1
  local crd
  shift
  for crd in "$@"; do
    kubectl get crd "$crd" >/dev/null 2>&1 || fail "Missing CRD: $crd ($hint)"
    kubectl wait --for=condition=Established "crd/$crd" --timeout=60s >/dev/null \
      || fail "CRD not Established: $crd"
  done
}

# Fail unless kgateway pods exist in $KGW_NS and all of them become Ready.
check_kgateway_pods() {
  local pods pod

  kubectl get ns "$KGW_NS" >/dev/null 2>&1 \
    || fail "Namespace '$KGW_NS' not found (run step c Helm installs)"

  # `kubectl get pods -l ...` exits 0 even when no pod matches, so the decision
  # is made on the returned names, not on the exit status.
  pods=$(kubectl get pods -n "$KGW_NS" -l app.kubernetes.io/name=kgateway -o name 2>/dev/null || true)
  if [[ -z "$pods" ]]; then
    # fallback: charts sometimes label differently, so match on the pod name
    pods=$(kubectl get pods -n "$KGW_NS" -o name | grep -E 'kgateway|gateway' || true)
  fi
  [[ -n "$pods" ]] || fail "Kgateway pods not found in '$KGW_NS'"

  while IFS= read -r pod; do
    kubectl wait -n "$KGW_NS" --for=condition=Ready "$pod" --timeout=180s >/dev/null \
      || fail "Pod not Ready: $pod"
  done <<<"$pods"
}

main() {
  # ===== Pre-flight checks =====
  need kubectl
  need helm

  # ===== Namespace ensure =====
  if ! kubectl get ns "$NAMESPACE" >/dev/null 2>&1; then
    kubectl create namespace "$NAMESPACE"
  fi

  info "Checking Gateway API CRDs…"
  check_crds "run step a" \
    gateways.gateway.networking.k8s.io \
    gatewayclasses.gateway.networking.k8s.io \
    httproutes.gateway.networking.k8s.io \
    referencegrants.gateway.networking.k8s.io
  ok "Gateway API CRDs present & Established"

  info "Checking GAIE (Inference Extension) CRDs…"
  check_crds "run step b install of inference extension" \
    inferencemodels.inference.networking.x-k8s.io \
    inferencepools.inference.networking.x-k8s.io
  ok "GAIE CRDs present & Established"

  info "Checking Kgateway controller in namespace '$KGW_NS'…"
  check_kgateway_pods
  ok "Kgateway controller pods Ready"

  kubectl get gateway.gateway.networking.k8s.io inference-gateway -n "$NAMESPACE" >/dev/null 2>&1 \
    || fail "Gateway 'inference-gateway' not found in $NAMESPACE (apply step d manifest)"
  ok "GAIE is installed and the gateway is up in namespace '$NAMESPACE'."
}

main "$@"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
apiVersion: v1
kind: ConfigMap
metadata:
name: epp-config
labels:
app.kubernetes.io/name: dynamo-gaie
app.kubernetes.io/instance: llama3-70b-agg
data:
epp-config-dynamo.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
# Required: tells EPP which profile to use (even if you only have one)
- type: single-profile-handler
# Picker: chooses the final endpoint after scoring
- name: picker
type: max-score-picker
- name: dyn-pre
type: dynamo-inject-workerid
parameters: {}
- name: dyn-kv
type: kv-aware-scorer
parameters:
frontendURL: http://127.0.0.1:8000/v1/chat/completions
timeoutMS: 10000
schedulingProfiles:
- name: default
plugins:
- pluginRef: dyn-kv
weight: 1
- pluginRef: picker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama3-70b-agg-epp
labels:
app: llama3-70b-agg-epp
spec:
replicas: 1
selector:
matchLabels:
app: llama3-70b-agg-epp
template:
metadata:
labels:
app: llama3-70b-agg-epp
spec:
serviceAccountName: epp-sa
terminationGracePeriodSeconds: 130
imagePullSecrets:
- name: docker-imagepullsecret
containers:
- name: epp
image: nvcr.io/nvstaging/ai-dynamo/epp-inference-extension-dynamo:v0.6.0-1
imagePullPolicy: IfNotPresent
resources:
requests:
memory: "1Gi"
cpu: "1"
limits:
memory: "2Gi"
cpu: "2"
command: ["/bin/sh", "-c"]
args:
- >
exec /epp
-poolName "llama3-70b-agg-pool"
-poolNamespace "$POD_NAMESPACE"
-v 4 --zap-encoder json
-grpcPort 9002 -grpcHealthPort 9003
-configFile /etc/epp/epp-config-dynamo.yaml
volumeMounts:
- name: epp-config
mountPath: /etc/epp
readOnly: true
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: PLATFORM_NAMESPACE
value: "$(POD_NAMESPACE)" # set to your dynamo platform namespace if different
- name: ETCD_ENDPOINTS
value: "dynamo-platform-etcd.$(PLATFORM_NAMESPACE):2379" # update dynamo-platform to appropriate namespace
- name: NATS_SERVER
value: "nats://dynamo-platform-nats.$(PLATFORM_NAMESPACE):4222" # update dynamo-platform to appropriate namespace
- name: DYN_NAMESPACE
value: "llama3-70b-agg"
- name: DYNAMO_KV_BLOCK_SIZE
value: "128" # UPDATE to match the --block-size in your deploy.yaml engine command
- name: USE_STREAMING
value: "true"
ports:
- containerPort: 9002
- containerPort: 9003
- name: metrics
containerPort: 9090
livenessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
volumes:
- name: epp-config
configMap:
name: epp-config
items:
- key: epp-config-dynamo.yaml
path: epp-config-dynamo.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
# parentRefs and backendRefs carry no namespace, so the Gateway and the InferencePool are resolved in the HTTPRoute's own namespace.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: llama3-70b-agg-route
spec:
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: inference-gateway
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
kind: InferencePool
name: llama3-70b-agg-pool
port: 8000
weight: 1
matches:
- path:
type: PathPrefix
value: /
timeouts:
request: 300s
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
# ClusterIP Service in front of the EPP (endpoint picker) pods; referenced by
# name from the InferencePool's extensionRef.
apiVersion: v1
kind: Service
metadata:
  name: llama3-70b-agg-epp
spec:
  selector:
    # Must equal the pod template label of the llama3-70b-agg-epp Deployment,
    # otherwise the Service has no endpoints.
    app: llama3-70b-agg-epp
  ports:
    # EPP gRPC port (matches -grpcPort 9002 in the Deployment args)
    - protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
  type: ClusterIP
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
name: llama3-70b-agg-model
spec:
criticality: Critical
modelName: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
poolRef:
group: inference.networking.x-k8s.io
kind: InferencePool
name: llama3-70b-agg-pool
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
name: llama3-70b-agg-pool
spec:
targetPortNumber: 8000
selector:
nvidia.com/dynamo-component: Frontend
nvidia.com/dynamo-namespace: llama3-70b-agg # # This is the Dynamo namespace where the model is deployed
extensionRef:
failureMode: FailOpen
group: ""
kind: Service
name: llama3-70b-agg-epp
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read
rules:
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencepools"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencemodels"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list"]
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
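# NOTE: This is a namespaced RoleBinding, so the pod-read ClusterRole is granted only inside the namespace it is applied to.
# metadata.namespace is intentionally omitted; select the namespace with `kubectl apply -n <namespace>`.
# subjects[].namespace is also omitted: the epp-sa ServiceAccount is expected in the same namespace as this RoleBinding.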
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: pod-read-binding
# no metadata.namespace - kubectl -n sets it
subjects:
- kind: ServiceAccount
name: epp-sa
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pod-read
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v1
kind: ServiceAccount
metadata:
name: epp-sa
# no metadata.namespace (kubectl -n sets it)
......@@ -14,9 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
set -euo pipefail
IFS=$'\n\t'
RECIPES_DIR="$( cd "$( dirname "$0" )" && pwd )"
# Default values
NAMESPACE="${NAMESPACE:-dynamo}"
DEPLOY_TYPE=""
GAIE="${GAIE:-false}"
DEPLOYMENT=""
MODEL=""
FRAMEWORK=""
......@@ -38,6 +43,7 @@ usage() {
echo "Optional:"
echo " --namespace <ns> Kubernetes namespace (default: dynamo)"
echo " --dry-run Print commands without executing them"
echo " --gaie[=true|false] Enable GAIE integration subfolder (applies GAIE manifests skips benchmark) (default: ${GAIE})"
echo " -h, --help Show this help message"
echo ""
echo "Environment Variables:"
......@@ -98,6 +104,22 @@ while [[ $# -gt 0 ]]; do
missing_requirement "$1"
fi
;;
--gaie)
GAIE=true
shift
;;
--gaie=false)
GAIE=false
shift
;;
--gaie=*)
GAIE="${1#*=}"
case "${GAIE,,}" in
true|false) GAIE="${GAIE,,}";;
*) echo "ERROR: --gaie must be true or false"; exit 1;;
esac
shift
;;
-h|--help)
usage
;;
......@@ -137,6 +159,7 @@ fi
MODEL_DIR="$RECIPES_DIR/$MODEL"
FRAMEWORK_DIR="$MODEL_DIR/${FRAMEWORK,,}"
DEPLOY_PATH="$FRAMEWORK_DIR/$DEPLOYMENT"
INTEGRATION="$([[ "${GAIE,,}" == "true" ]] && echo gaie || echo "")"
# Check if model directory exists
if [[ ! -d "$MODEL_DIR" ]]; then
......@@ -188,6 +211,7 @@ echo "Model: $MODEL"
echo "Framework: ${FRAMEWORK,,}"
echo "Deployment Type: $DEPLOYMENT"
echo "Namespace: $NAMESPACE"
echo "GAIE integration: $GAIE"
echo "======================================"
# Handle model downloading
......@@ -205,6 +229,15 @@ $DRY_RUN kubectl wait --for=condition=Complete job/$MODEL_DOWNLOAD_JOB_NAME -n $
echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOYMENT configuration..."
$DRY_RUN kubectl apply -n $NAMESPACE -f $DEPLOY_FILE
# GAIE integration: run the gateway pre-flight checks, apply the GAIE
# manifests for this deployment, then stop (the benchmark is skipped).
if [[ "$INTEGRATION" == "gaie" ]]; then
    SCRIPT_DIR="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    # NAMESPACE is handed to the child explicitly: it is a plain shell variable
    # in this script, so it reaches gaie_checks.sh only if it was already in the
    # caller's environment.
    # Both commands are prefixed with $DRY_RUN, like the other kubectl calls in
    # this script, so --dry-run prints them without executing them.
    $DRY_RUN env NAMESPACE="$NAMESPACE" "${SCRIPT_DIR}/gaie_checks.sh"
    $DRY_RUN kubectl apply -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE"
    # For now do not run the benchmark
    exit 0
fi
# Launch the benchmark job (if available)
if [[ "$PERF_AVAILABLE" == "true" ]]; then
echo "Launching benchmark job..."
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment