feat: add crds for vllm and llm examples (#1766)

Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>

feat: add crds for vllm and llm examples (#1766)
Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
5505507b · mohammedabdulwahhab · GitHub · 439e977d · 5505507b · 5505507b
Unverified Commit 5505507b authored Jul 07, 2025 by mohammedabdulwahhab Committed by GitHub Jul 07, 2025
10 changed files
--- a/examples/llm/deploy/agg.yaml
+++ b/examples/llm/deploy/agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # aggregated llm example: Frontend, Processor and VllmWorker services
+metadata:
+  name: llm-agg
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; top-level keys: Common, Frontend, Processor, VllmWorker, Planner
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:  # NOTE(review): the JSON above has a Planner section but no Planner service is declared here
+    Frontend:
+      dynamoNamespace: llm-agg  # every service in this manifest uses the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/llm  # presumably where graphs.agg is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+    Processor:
+      dynamoNamespace: llm-agg
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    VllmWorker:  # NOTE(review): no componentType here, unlike Frontend and Processor - confirm intentional
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      dynamoNamespace: llm-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # same count as VllmWorker.ServiceArgs.resources.gpu in DYN_DEPLOYMENT_CONFIG
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
--- a/examples/llm/deploy/agg_router.yaml
+++ b/examples/llm/deploy/agg_router.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # aggregated llm example with a Router service between Processor and VllmWorker config
+metadata:
+  name: agg-router  # NOTE(review): differs from the llm-agg-router dynamoNamespace used below - confirm intentional
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; Common.router is kv; kv-transfer-config is itself an escaped JSON string
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:  # NOTE(review): the JSON above has a Planner section but no Planner service is declared here
+    Frontend:
+      dynamoNamespace: llm-agg-router  # every service in this manifest uses the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/llm  # presumably where graphs.agg_router is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+    Processor:
+      dynamoNamespace: llm-agg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    Router:  # configured by the Router section of DYN_DEPLOYMENT_CONFIG (min-workers, softmax-sample)
+      dynamoNamespace: llm-agg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:Router
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Router
+    VllmWorker:  # NOTE(review): no componentType here, unlike the other services - confirm intentional
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      dynamoNamespace: llm-agg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # same count as VllmWorker.ServiceArgs.resources.gpu in DYN_DEPLOYMENT_CONFIG
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
--- a/examples/llm/deploy/disagg.yaml
+++ b/examples/llm/deploy/disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated llm example: Frontend, Processor, VllmWorker and PrefillWorker services
+metadata:
+  name: llm-disagg
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; VllmWorker section sets remote-prefill and conditional-disagg to true
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:  # NOTE(review): the JSON above has a Planner section but no Planner service is declared here
+    Frontend:
+      dynamoNamespace: llm-disagg  # every service in this manifest uses the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/llm  # presumably where graphs.disagg is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+    Processor:
+      dynamoNamespace: llm-disagg
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    VllmWorker:  # NOTE(review): no componentType here, unlike Frontend and Processor - confirm intentional
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      dynamoNamespace: llm-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # same count as VllmWorker.ServiceArgs.resources.gpu in DYN_DEPLOYMENT_CONFIG
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:  # sized identically to VllmWorker; also declared without a componentType
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # same count as PrefillWorker.ServiceArgs.resources.gpu in DYN_DEPLOYMENT_CONFIG
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
--- a/examples/llm/deploy/disagg_router.yaml
+++ b/examples/llm/deploy/disagg_router.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated llm example with a Router: Frontend, Processor, Router, VllmWorker, PrefillWorker
+metadata:
+  name: disagg-router  # NOTE(review): differs from the llm-disagg-router dynamoNamespace used below - confirm intentional
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; Common.router is kv and VllmWorker sets remote-prefill to true
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:  # NOTE(review): the JSON above has a Planner section but no Planner service is declared here
+    Frontend:
+      dynamoNamespace: llm-disagg-router  # every service in this manifest uses the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/llm  # presumably where graphs.disagg_router is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+    Processor:
+      dynamoNamespace: llm-disagg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    Router:  # configured by the Router section of DYN_DEPLOYMENT_CONFIG (min-workers)
+      dynamoNamespace: llm-disagg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:Router
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Router
+    VllmWorker:  # NOTE(review): no componentType here, unlike the services above - confirm intentional
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # same count as VllmWorker.ServiceArgs.resources.gpu in DYN_DEPLOYMENT_CONFIG
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:  # sized identically to VllmWorker; also declared without a componentType
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # same count as PrefillWorker.ServiceArgs.resources.gpu in DYN_DEPLOYMENT_CONFIG
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
--- a/examples/vllm_v0/deploy/agg.yaml
+++ b/examples/vllm_v0/deploy/agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # aggregated vllm_v0 example: Frontend and VllmWorker services only
+metadata:
+  name: agg  # NOTE(review): generic name, unlike the vllm-v0-agg dynamoNamespace used below - confirm it cannot clash with other examples
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; Frontend.endpoint points at dynamo.VllmWorker.generate
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v0-agg  # both services in this manifest use the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/vllm_v0  # presumably where graphs.agg is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+    VllmWorker:  # NOTE(review): no componentType here, unlike Frontend - confirm intentional
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      dynamoNamespace: vllm-v0-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated vllm_v0 example: Frontend, VllmWorker and PrefillWorker services
+metadata:
+  name: disagg  # NOTE(review): generic name, unlike the vllm-v0-disagg dynamoNamespace used below - confirm it cannot clash with other examples
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; VllmWorker section sets remote-prefill and conditional-disagg to true
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v0-disagg  # every service in this manifest uses the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/vllm_v0  # presumably where graphs.disagg is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+    VllmWorker:  # NOTE(review): no componentType here, unlike Frontend - confirm intentional
+      dynamoNamespace: vllm-v0-disagg
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:  # sized identically to VllmWorker; also declared without a componentType
+      dynamoNamespace: vllm-v0-disagg
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated vllm_v0 example plus Planner and Prometheus services
+metadata:
+  name: disagg-planner  # NOTE(review): differs from the vllm-v0-disagg-planner dynamoNamespace used below - confirm intentional
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-line JSON; adds Prometheus scrape settings and Planner tuning to the disagg config
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v0-disagg-planner  # every service in this manifest uses the same dynamoNamespace
+      componentType: main
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "2"
+          memory: "4Gi"
+        limits:
+          cpu: "2"
+          memory: "4Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # NOTE(review): floating tag; pin a release for reproducible deploys
+          workingDir: /workspace/examples/vllm_v0  # presumably where graphs.disagg_planner is resolved from - confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Frontend
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the argument a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # same as this entry's key under services
+
+    VllmWorker:  # NOTE(review): no componentType here, unlike Frontend and Planner - confirm intentional
+      dynamoNamespace: vllm-v0-disagg-planner
+      envFromSecret: hf-token-secret  # secret is not defined in this manifest; presumably holds the Hugging Face token
+      replicas: 1
+      resources:
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"  # NOTE(review): 2 GPUs requested but the VllmWorker config sets no tensor-parallel-size - confirm both are used
+        limits:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+
+    PrefillWorker:  # sized identically to VllmWorker; also declared without a componentType
+      dynamoNamespace: vllm-v0-disagg-planner
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"  # NOTE(review): same 2-GPU question as VllmWorker - confirm
+        limits:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
+
+    Planner:  # tuned by the Planner section of DYN_DEPLOYMENT_CONFIG (adjustment-interval, isl, osl, ttft, itl, load-predictor)
+      dynamoNamespace: vllm-v0-disagg-planner
+      replicas: 1
+      componentType: planner
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "2"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Planner
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner
+            - --Planner.environment=kubernetes  # passed on the command line; the Planner JSON section does not set environment
+
+    Prometheus:  # NOTE(review): scrape targets in DYN_DEPLOYMENT_CONFIG are localhost:9090 and localhost:8000 - confirm the frontend target resolves from this service
+      dynamoNamespace: vllm-v0-disagg-planner
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1000m"
+          memory: "1000Mi"
+        limits:
+          cpu: "1000m"
+          memory: "1000Mi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Prometheus
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Prometheus
--- a/examples/vllm_v1/deploy/agg.yaml
+++ b/examples/vllm_v1/deploy/agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg  # NOTE(review): generic name, while the services use dynamoNamespace vllm-v1-agg - confirm it cannot clash with other examples
+spec:
+  envs:  # NOTE(review): presumably injected into every service declared under `services` - confirm against the CRD
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with sections Common, Frontend, VllmWorker. NOTE(review): no service named VllmWorker is declared (workers are VllmDecodeWorker) and SimpleLoadBalancer has no section - confirm the names match
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
+  services:
+    Frontend:  # the only componentType: main service in this manifest; launched as graphs.agg:Frontend
+      dynamoNamespace: vllm-v1-agg
+      componentType: main
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+      resources:  # requests and limits carry identical values
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:
+          cpu: '1'
+          memory: '2Gi'
+    SimpleLoadBalancer:  # launched as graphs.agg:SimpleLoadBalancer; requests no GPU
+      dynamoNamespace: vllm-v1-agg
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.agg:SimpleLoadBalancer
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer
+      resources:  # requests equal limits. NOTE(review): 20Gi matches the GPU worker's memory - confirm a CPU-only service needs it
+        requests:
+          cpu: '1'
+          memory: '20Gi'
+        limits:
+          cpu: '1'
+          memory: '20Gi'
+    VllmDecodeWorker:  # the only service in this manifest that requests a GPU; launched as graphs.agg:VllmDecodeWorker
+      dynamoNamespace: vllm-v1-agg
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.agg:VllmDecodeWorker
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+      resources:  # requests and limits carry identical values, including one nvidia.com/gpu
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
--- a/examples/vllm_v1/deploy/disagg.yaml
+++ b/examples/vllm_v1/deploy/disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg  # NOTE(review): generic name, while the services use dynamoNamespace vllm-v1-disagg - confirm it cannot clash with other examples
+spec:
+  envs:  # NOTE(review): presumably injected into every service declared under `services` - confirm against the CRD
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with sections Common, Frontend, VllmWorker, PrefillWorker; backslashes stay literal inside single quotes. NOTE(review): declared workers are VllmDecodeWorker/VllmPrefillWorker - confirm the section names match
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
+  services:
+    Frontend:  # the only componentType: main service in this manifest; launched as graphs.disagg:Frontend
+      dynamoNamespace: vllm-v1-disagg
+      componentType: main
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+      resources:  # requests and limits carry identical values
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:
+          cpu: '1'
+          memory: '2Gi'
+    SimpleLoadBalancer:  # launched as graphs.disagg:SimpleLoadBalancer; requests no GPU
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg:SimpleLoadBalancer
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer
+      resources:  # requests equal limits. NOTE(review): 20Gi matches the GPU workers' memory - confirm a CPU-only service needs it
+        requests:
+          cpu: '1'
+          memory: '20Gi'
+        limits:
+          cpu: '1'
+          memory: '20Gi'
+    VllmDecodeWorker:  # GPU-backed service launched as graphs.disagg:VllmDecodeWorker
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg:VllmDecodeWorker
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+      resources:  # requests and limits carry identical values, including one nvidia.com/gpu
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+    VllmPrefillWorker:  # GPU-backed service launched as graphs.disagg:VllmPrefillWorker
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg:VllmPrefillWorker
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker
+      resources:  # same CPU, memory and GPU figures as VllmDecodeWorker; requests equal limits
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
--- a/examples/vllm_v1/deploy/disagg_planner.yaml
+++ b/examples/vllm_v1/deploy/disagg_planner.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg-planner  # NOTE(review): generic name, while the services use dynamoNamespace vllm-v1-disagg-planner - confirm it cannot clash with other examples
+spec:
+  envs:  # NOTE(review): presumably injected into every service declared under `services` - confirm against the CRD
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with sections Common, Frontend, VllmWorker, PrefillWorker, Prometheus, Planner. NOTE(review): declared workers are VllmDecodeWorker/VllmPrefillWorker, and the Prometheus scrape targets are localhost:9090 and localhost:8000 - confirm both
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
+  services:
+    Frontend:  # the componentType: main service of this manifest; launched as graphs.disagg_planner:Frontend
+      dynamoNamespace: vllm-v1-disagg-planner
+      componentType: main
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+      resources:  # requests and limits carry identical values
+        requests:
+          cpu: '2'
+          memory: '4Gi'
+        limits:
+          cpu: '2'
+          memory: '4Gi'
+
+    SimpleLoadBalancer:  # launched as graphs.disagg_planner:SimpleLoadBalancer; requests no GPU
+      dynamoNamespace: vllm-v1-disagg-planner
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg_planner:SimpleLoadBalancer
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer
+      resources:  # requests equal limits. NOTE(review): confirm a CPU-only service needs 20Gi of memory
+        requests:
+          cpu: '1'
+          memory: '20Gi'
+        limits:
+          cpu: '1'
+          memory: '20Gi'
+
+    VllmDecodeWorker:  # GPU-backed service launched as graphs.disagg_planner:VllmDecodeWorker
+      dynamoNamespace: vllm-v1-disagg-planner
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmDecodeWorker
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+      resources:  # requests equal limits. NOTE(review): 2 GPUs per replica, but DYN_DEPLOYMENT_CONFIG sets no tensor-parallel option - confirm
+        requests:
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'
+        limits:
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'
+
+    VllmPrefillWorker:  # GPU-backed service launched as graphs.disagg_planner:VllmPrefillWorker
+      dynamoNamespace: vllm-v1-disagg-planner
+      envFromSecret: hf-token-secret  # NOTE(review): presumably supplies the Hugging Face token - confirm
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmPrefillWorker
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker
+      resources:  # requests equal limits. NOTE(review): 2 GPUs per replica, but DYN_DEPLOYMENT_CONFIG sets no tensor-parallel option - confirm
+        requests:
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'
+        limits:
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'
+
+    Planner:  # the componentType: planner service, launched as graphs.disagg_planner:Planner
+      dynamoNamespace: vllm-v1-disagg-planner
+      componentType: planner
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Planner
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner
+            - --Planner.environment=kubernetes  # the Planner section of DYN_DEPLOYMENT_CONFIG has no "environment" key; presumably supplied here - confirm
+      resources:  # requests and limits carry identical values
+        requests:
+          cpu: '2'
+          memory: '2Gi'
+        limits:
+          cpu: '2'
+          memory: '2Gi'
+
+    Prometheus:  # launched as graphs.disagg_planner:Prometheus. NOTE(review): its config scrapes localhost:9090 and localhost:8000 - confirm the frontend target is reachable from here
+      dynamoNamespace: vllm-v1-disagg-planner
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:  # `dynamo serve` target followed by system-app and health-check flags
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Prometheus
+            - --system-app-port
+            - '5000'  # quoted so the port is passed as a string, not a YAML integer
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Prometheus
+      resources:  # requests and limits carry identical values (1000m CPU, 1000Mi memory)
+        requests:
+          cpu: '1000m'
+          memory: '1000Mi'
+        limits:
+          cpu: '1000m'
+          memory: '1000Mi'