feat: approx kv router deployment example (#5037)

c75bb583 · Biswa Panda · GitHub · 6caac575 · c75bb583
Unverified Commit c75bb583 authored Dec 19, 2025 by Biswa Panda Committed by GitHub Dec 19, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 52 additions and 0 deletions

examples/backends/vllm/deploy/agg_router_kv_approx.yaml examples/backends/vllm/deploy/agg_router_kv_approx.yaml +52 -0

No files found.
--- a/examples/backends/vllm/deploy/agg_router_kv_approx.yaml
+++ b/examples/backends/vllm/deploy/agg_router_kv_approx.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+## This example demonstrates KV-aware routing with the --no-kv-events flag.
+## Instead of receiving KV events from workers, the router predicts cache state
+## locally based on routing decisions with TTL-based expiration and pruning.
+## Note: In this mode, the Dynamo platform can be deployed without NATS or JetStream.
+# Custom resource of kind DynamoGraphDeployment (API group nvidia.com, version v1alpha1).
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  # Resource name; the services below use the same string as their dynamoNamespace.
+  name: vllm-agg-router-kv-approx
+spec:
+  # Services that make up this deployment, keyed by service name.
+  services:
+    # Frontend service: a single replica that runs
+    #   python3 -m dynamo.frontend --router-mode kv --no-kv-events
+    # With --no-kv-events the router does not receive KV events from workers
+    # and instead predicts cache state locally from its own routing decisions.
+    Frontend:
+      componentType: frontend
+      dynamoNamespace: vllm-agg-router-kv-approx
+      replicas: 1
+      # The router mode is also supplied through the environment, in addition
+      # to the --router-mode argument below.
+      envs:
+        - name: DYN_ROUTER_MODE
+          value: kv
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          command: [python3]
+          # Argument order is significant: each flag is followed by its value.
+          args:
+            - '-m'
+            - 'dynamo.frontend'
+            - '--router-mode'
+            - 'kv'
+            - '--no-kv-events'
+    # vLLM worker service: two replicas, each limited to one GPU, running
+    #   python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-events-config <json>
+    VllmDecodeWorker:
+      # NOTE(review): presumably this secret holds the Hugging Face token used
+      # to download the model -- confirm it exists before deploying.
+      envFromSecret: hf-token-secret
+      # Same dynamoNamespace as the Frontend service.
+      dynamoNamespace: vllm-agg-router-kv-approx
+      componentType: worker
+      replicas: 2
+      resources:
+        limits:
+          # Quoted so the GPU count stays a string.
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+          command:
+            - python3
+            - -m
+            - dynamo.vllm
+          args:
+            - --model
+            - Qwen/Qwen3-0.6B
+            # Turns enable_kv_cache_events off on the worker; the Frontend is
+            # started with --no-kv-events, so it does not consume them.
+            - --kv-events-config
+            - '{"enable_kv_cache_events": false}'