"vscode:/vscode.git/clone" did not exist on "2dbfed896a5ab929caabbc3355a0c7b70d0723fb"
Unverified Commit 2d602853 authored by atchernych's avatar atchernych Committed by GitHub
Browse files

feat: Add EPP startup probe and adjust recipe - fixes [DEP-749] (#5770)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
Co-authored-by: Julien Mancuso <jmancuso@nvidia.com>
parent 07d57894
......@@ -146,6 +146,18 @@ kubectl apply -f operator-managed/examples/agg.yaml -n ${NAMESPACE}
kubectl apply -f operator-managed/examples/http-route.yaml -n ${NAMESPACE}
```
**Startup Probe Timeout:** The EPP has a default startup probe timeout of 30 minutes (10s × 180 failures).
If your model takes longer to load, increase the `failureThreshold` in the EPP's `startupProbe`. For example,
to allow 60 minutes for startup:
```yaml
extraPodSpec:
mainContainer:
startupProbe:
failureThreshold: 360 # 10s × 360 = 60 minutes
```
**Gateway Namespace**
Note that this assumes your gateway is installed into `NAMESPACE=my-model` (the examples' default).
If you installed it into a different namespace, you need to adjust the HTTPRoute entry in `http-route.yaml`.
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: vllm-agg-route
spec:
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: inference-gateway
# Note: This assumes your gateway is installed into the same namespace as this HTTPRoute.
# If you installed it into a different namespace, add: namespace: <your-gateway-namespace>
rules:
- backendRefs:
- group: inference.networking.k8s.io
kind: InferencePool
name: vllm-agg-pool
port: 8000
weight: 1
matches:
- path:
type: PathPrefix
value: /
timeouts:
request: 300s
\ No newline at end of file
......@@ -71,17 +71,26 @@ func (e *EPPDefaults) GetBaseContainer(context ComponentContext) (corev1.Contain
PeriodSeconds: 10,
}
// Startup probe allows long initialization while waiting for workers to register.
// EPP waits indefinitely for discovery to find workers, so this probe is the
// only timeout mechanism. Default: 30 minutes (10s × 180 = 1800s).
container.StartupProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
GRPC: &corev1.GRPCAction{
Port: 9003,
Service: ptr.To("inference-extension"),
},
},
PeriodSeconds: 10,
FailureThreshold: 180,
}
// EPP-specific environment variables
container.Env = append(container.Env, []corev1.EnvVar{
{
Name: "DYN_KV_BLOCK_SIZE",
Value: "16",
},
{
// DYN_DISCOVERY_TIMEOUT_SEC is how long to wait for workers to register (in seconds)
Name: "DYN_DISCOVERY_TIMEOUT_SEC",
Value: "300",
},
{
Name: "USE_STREAMING",
Value: "true",
......
......@@ -57,47 +57,30 @@ pub enum DynamoLlmResult {
ERR = 1,
}
/// Fallback discovery-sync timeout, in seconds, used when the environment does not supply one.
const DEFAULT_DISCOVERY_TIMEOUT_SEC: u64 = 10;
/// Resolve the discovery-sync timeout in seconds.
///
/// The value of the `DYN_DISCOVERY_TIMEOUT_SEC` environment variable is used when it is set
/// and parses as a `u64`; in every other case `DEFAULT_DISCOVERY_TIMEOUT_SEC` is returned.
fn get_discovery_timeout_secs() -> u64 {
    match std::env::var("DYN_DISCOVERY_TIMEOUT_SEC") {
        // Variable present: an unparsable value silently falls back to the default.
        Ok(raw) => raw.parse::<u64>().unwrap_or(DEFAULT_DISCOVERY_TIMEOUT_SEC),
        // Variable missing (or not valid Unicode): use the default.
        Err(_) => DEFAULT_DISCOVERY_TIMEOUT_SEC,
    }
}
/// Wait for the discovery daemon to sync and return at least one instance.
/// This ensures list() calls will have data available.
/// Returns the number of instances found, or 0 if timed out.
async fn wait_for_discovery_sync(drt: &DistributedRuntime, timeout_secs: u64) -> usize {
tracing::info!("Waiting for discovery to sync...");
// Wait indefinitely for the discovery daemon to sync and return at least one instance.
// Model info is registered by the workers, and it may take up to 30 minutes for the model weights to load and for a worker to register itself.
// The timeout is enforced by the Kubernetes StartupProbe: the EPP wait loop runs indefinitely, and the probe is the single source of truth for when to kill the EPP if discovery fails.
// If workers are not found within the probe's failureThreshold × periodSeconds, the pod will be killed and restarted.
// Users can adjust the StartupProbe timeout in the DGD for large models.
async fn wait_for_discovery_sync(drt: &DistributedRuntime) -> usize {
tracing::info!(
"Waiting for discovery to sync (no timeout - controlled by K8s StartupProbe)..."
);
let discovery = drt.discovery();
let timeout = std::time::Duration::from_secs(timeout_secs);
let start = std::time::Instant::now();
loop {
match discovery.list(DiscoveryQuery::AllModels).await {
Ok(instances) if !instances.is_empty() => {
tracing::info!(
"Discovery sync complete: found {} instances",
instances.len()
);
return instances.len();
}
Ok(_) => {
if start.elapsed() > timeout {
tracing::warn!("Discovery sync timed out waiting for instances");
return 0;
}
tracing::debug!("No instances yet, waiting...");
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
Err(e) => {
tracing::warn!("Discovery list error: {}, continuing...", e);
return 0;
// Log and continue - transient errors shouldn't stop the wait
tracing::warn!("Discovery list error: {}, retrying...", e);
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
}
}
......@@ -128,18 +111,12 @@ pub unsafe extern "C" fn dynamo_llm_init(
.await
{
Ok(drt) => {
// Wait for discovery to sync before returning
// Wait for discovery to sync before returning.
// This is needed because dynamo_create_worker_selection_pipeline() is called
// immediately after, and it needs discovery.list() to return data
// the discovery daemon takes time to query K8s and returns async, so we need to wait.
let timeout_secs = get_discovery_timeout_secs();
let instance_count = wait_for_discovery_sync(drt, timeout_secs).await;
if instance_count == 0 {
tracing::error!(
"Discovery sync failed: no worker instances found. Is the backend running?"
);
return Err(DynamoLlmResult::ERR);
}
// immediately after, and it needs discovery.list() to return data.
// The discovery daemon takes time to query K8s and returns async, so we need to wait.
// Note: This waits indefinitely - the K8s StartupProbe is the timeout mechanism.
wait_for_discovery_sync(drt).await;
Ok(())
}
Err(e) => {
......@@ -1374,14 +1351,9 @@ pub async fn create_worker_selection_pipeline_chat(
// Only wait for discovery sync if we just initialized the DRT
// (dynamo_llm_init already does this when it initializes)
// Note: This waits indefinitely - the K8s StartupProbe is the timeout mechanism.
if needs_sync {
let timeout_secs = get_discovery_timeout_secs();
let instance_count = wait_for_discovery_sync(distributed_runtime, timeout_secs).await;
if instance_count == 0 {
return Err(anyhow::anyhow!(
"Discovery sync failed: no worker instances found. Is the backend running?"
));
}
wait_for_discovery_sync(distributed_runtime).await;
}
let component = distributed_runtime
......
......@@ -64,4 +64,4 @@ curl http://localhost:8000/v1/chat/completions \
- Update `storageClassName` in `model-cache/model-cache.yaml` to match your cluster before deploying
- Model download takes approximately 15-30 minutes depending on network speed
- For GAIE (Gateway API Inference Extension) integration, see [vllm/agg/gaie/](vllm/agg/gaie/)
- For GAIE (Gateway API Inference Extension) integration, `kubectl apply` the files from the corresponding subfolder, e.g., [vllm/agg/gaie/](vllm/agg/gaie/)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-agg
name: llama3-70b-agg
spec:
backendFramework: vllm
pvcs:
- name: model-cache
create: false
services:
Epp:
envFromSecret: hf-token-secret
......@@ -13,14 +16,12 @@ spec:
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/frontend-image:my-tag
image: nvcr.io/nvidia/ai-dynamo/frontend:0.8.0
env:
- name: DYN_KV_BLOCK_SIZE
value: "16"
value: "128"
- name: DYN_MODEL
value: "Qwen/Qwen3-0.6B" # Match your model
- name: DYN_DISCOVERY_TIMEOUT
value: "300"
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
eppConfig:
# This configuration uses Dynamo's KV-aware scorer for intelligent routing
config:
......@@ -43,35 +44,46 @@ spec:
Frontend:
envFromSecret: hf-token-secret
componentType: frontend
volumeMounts:
- name: model-cache
mountPoint: /opt/models
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
workingDir: /workspace/examples/backends/vllm
envs:
- name: HF_HOME
value: /opt/models
replicas: 1
VllmPrefillWorker:
componentType: worker
envFromSecret: hf-token-secret
volumeMounts:
- name: model-cache
mountPoint: /opt/models
sharedMemory:
size: 20Gi
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
env:
- name: DYN_LOG
value: "debug,dynamo_llm::kv_router=trace"
- name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: HF_HOME
value: /opt/models
- name: DYN_STORE_KV
value: "mem"
- name: DYN_ROUTER_MODE
value: "kv"
VllmDecodeWorker:
envFromSecret: hf-token-secret
componentType: worker
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.0
workingDir: /workspace/examples/backends/vllm
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
- -m
- dynamo.vllm
args:
- --model
- Qwen/Qwen3-0.6B
env:
- name: DYN_STORE_KV
value: "mem"
gpu: "4"
requests:
gpu: "4"
......@@ -27,7 +27,7 @@ spec:
namespace: kgateway-system
rules:
- backendRefs:
- group: inference.networking.x-k8s.io
- group: inference.networking.k8s.io
kind: InferencePool
name: llama3-70b-agg-pool
port: 8000
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You can remove the namespace field if using kubectl apply -n
apiVersion: v1
kind: ConfigMap
metadata:
name: epp-config
labels:
app.kubernetes.io/name: dynamo-gaie
data:
epp-config-dynamo.yaml: |
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
# Required: tells EPP which profile to use (even if you only have one)
- type: single-profile-handler
# Picker: chooses the final endpoint after scoring
- name: picker
type: max-score-picker
- name: dyn-pre
type: dynamo-inject-workerid
parameters: {}
- name: dyn-kv
type: kv-aware-scorer
- name: dyn-cleanup
type: dynamo-cleanup
schedulingProfiles:
- name: default
plugins:
- pluginRef: dyn-kv
weight: 1
- pluginRef: picker
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: Update the namespace field below to match your deployment namespace
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama3-70b-agg-epp
labels:
app: llama3-70b-agg-epp
spec:
replicas: 1
selector:
matchLabels:
app: llama3-70b-agg-epp
template:
metadata:
labels:
app: llama3-70b-agg-epp
spec:
serviceAccountName: epp-sa
terminationGracePeriodSeconds: 130
imagePullSecrets:
- name: docker-imagepullsecret
containers:
- name: epp
image: nvcr.io/nvidia/ai-dynamo/frontend:0.8.0
imagePullPolicy: IfNotPresent
resources:
requests:
memory: "1Gi"
cpu: "1"
limits:
memory: "2Gi"
cpu: "2"
command: ["/bin/sh", "-c"]
args:
- >
exec /epp
-poolName "llama3-70b-agg-pool"
-poolNamespace "$POD_NAMESPACE"
-v 4 --zap-encoder json
-grpcPort 9002 -grpcHealthPort 9003
-configFile /etc/epp/epp-config-dynamo.yaml
volumeMounts:
- name: epp-config
mountPath: /etc/epp
readOnly: true
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_UID
valueFrom:
fieldRef:
fieldPath: metadata.uid
- name: PLATFORM_NAMESPACE
value: "$(POD_NAMESPACE)" # set to your dynamo platform namespace if different
# if you want to use etcd enable this and remove the DYN_DISCOVERY_BACKEND env var
# - name: ETCD_ENDPOINTS
# value: "dynamo-platform-etcd.$(PLATFORM_NAMESPACE):2379" # update dynamo-platform to appropriate namespace
- name: NATS_SERVER
value: "nats://dynamo-platform-nats.$(PLATFORM_NAMESPACE):4222" # update dynamo-platform to appropriate namespace
- name: DYNAMO_NAMESPACE
value: "$(POD_NAMESPACE)-llama3-70b-agg"
- name: DYN_MODEL
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: DYN_KV_BLOCK_SIZE
value: "128" # UPDATE to match the --block-size in your deploy.yaml engine command
- name: USE_STREAMING
value: "true"
- name: DYN_ENFORCE_DISAGG
value: "false"
- name: DYN_DISCOVERY_BACKEND
value: "kubernetes"
ports:
- containerPort: 9002
- containerPort: 9003
- name: metrics
containerPort: 9090
livenessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
readinessProbe:
grpc:
port: 9003
service: inference-extension
initialDelaySeconds: 5
periodSeconds: 10
volumes:
- name: epp-config
configMap:
name: epp-config
items:
- key: epp-config-dynamo.yaml
path: epp-config-dynamo.yaml
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: Update the namespace field below to match your deployment namespace
apiVersion: v1
kind: Service
metadata:
name: llama3-70b-agg-epp
spec:
selector:
app: llama3-70b-agg-epp
ports:
- protocol: TCP
port: 9002
targetPort: 9002
appProtocol: http2
type: ClusterIP
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You can remove the namespace field if using kubectl apply -n
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
name: llama3-70b-agg-model
spec:
criticality: Critical
modelName: RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
poolRef:
group: inference.networking.x-k8s.io
kind: InferencePool
name: llama3-70b-agg-pool
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You can remove the namespace field if using kubectl apply -n
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
metadata:
name: llama3-70b-agg-pool
spec:
targetPortNumber: 8000
selector:
nvidia.com/dynamo-component-type: frontend
extensionRef:
failureMode: FailOpen
group: ""
kind: Service
name: llama3-70b-agg-epp
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: pod-read
rules:
# Gateway API inference resources
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencepools"]
verbs: ["get", "watch", "list"]
- apiGroups: ["inference.networking.x-k8s.io"]
resources: ["inferencemodels"]
verbs: ["get", "watch", "list"]
# Core resources for pod discovery
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list"]
# Dynamo k8s service discovery - endpointslices
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["get", "list", "watch"]
# Dynamo k8s service discovery - worker metadata CRs
- apiGroups: ["nvidia.com"]
resources: ["dynamoworkermetadatas"]
verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
# Authentication/authorization
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: This RoleBinding is namespace-scoped; metadata.namespace is intentionally omitted
# so that `kubectl apply -n <namespace>` sets it.
# The ServiceAccount subject below has no namespace field; it is expected to live in the same namespace as this binding.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: pod-read-binding
# no metadata.namespace - kubectl -n sets it
subjects:
- kind: ServiceAccount
name: epp-sa
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: pod-read
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: v1
kind: ServiceAccount
metadata:
name: epp-sa
# no metadata.namespace (kubectl -n sets it)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment