feat: update DynamoGraphDeployments for vllm_v1 (#1890)

Co-authored-by: mohammedabdulwahhab <furkhan324@berkeley.edu>

feat: update DynamoGraphDeployments for vllm_v1 (#1890)
Co-authored-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
5bf23d54 · hhzhang16 · GitHub · 9e76590f · 5bf23d54 · 5bf23d54
Unverified Commit 5bf23d54 authored Jul 14, 2025 by hhzhang16 Committed by GitHub Jul 14, 2025
6 changed files
--- a/examples/vllm/README.md
+++ b/examples/vllm/README.md
@@ -116,6 +116,40 @@ bash launch/dep.sh
 > [!TIP]
 > Run a disaggregated example and try adding another prefill worker once the setup is running! The system will automatically discover and utilize the new worker.
+### Kubernetes Deployment
+For Kubernetes deployment, YAML manifests are provided in the `deploy/` directory. These define DynamoGraphDeployment resources for various configurations:
+- `agg.yaml` - Aggregated serving
+- `agg_router.yaml` - Aggregated serving with KV routing
+- `disagg.yaml` - Disaggregated serving
+- `disagg_router.yaml` - Disaggregated serving with KV routing
+#### Prerequisites
+- **Dynamo Cloud**: Follow the [Quickstart Guide](../../docs/guides/dynamo_deploy/quickstart.md) to deploy Dynamo Cloud first.
+- **Container Images**: The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime`. If you don't have access, build and push your own image:
+  ```bash
+  ./container/build.sh --framework VLLM_V1
+  # Tag and push to your container registry
+  # Update the image references in the YAML files
+  ```
+- **Port Forwarding**: After deployment, forward the frontend service to access the API:
+  ```bash
+  kubectl port-forward pod/vllm-v1-disagg-frontend-<pod-uuid-info> 8080:8000
+  ```
+#### Deploy to Kubernetes
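+Example using the disaggregated configuration (`disagg.yaml`); adjust the path if the repository is not checked out at `~/dynamo`: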
+```bash
+cd ~/dynamo/examples/vllm/deploy
+kubectl apply -f disagg.yaml
+```
 ### Testing the Deployment
 Send a test request to verify your deployment:

--- a/examples/vllm/deploy/agg.yaml
+++ b/examples/vllm/deploy/agg.yaml
@@ -15,10 +15,28 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: agg
+  name: vllm-v1-agg
 spec:
  services:
    Frontend:
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      dynamoNamespace: vllm-v1-agg
      componentType: main
      replicas: 1
@@ -31,50 +49,38 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
            - dynamo
-            - serve
+            - run
-            - graphs.agg:Frontend
+            - in=http
-            - --system-app-port
+            - out=dyn
-            - "5000"
+            - --http-port
-            - --enable-system-app
+            - "8000"
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-            - -f
-            - ./configs/agg.yaml
-    SimpleLoadBalancer:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: vllm-v1-agg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "20Gi"
-        limits:
-          cpu: "1"
-          memory: "20Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v1
-          args:
-            - dynamo
-            - serve
-            - graphs.agg:SimpleLoadBalancer
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - SimpleLoadBalancer
-            - -f
-            - ./configs/agg.yaml
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      dynamoNamespace: vllm-v1-agg
+      componentType: worker
      replicas: 1
      resources:
        requests:
@@ -87,17 +93,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
-            - dynamo
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
-            - serve
-            - graphs.agg:VllmDecodeWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmDecodeWorker
-            - -f
-            - ./configs/agg.yaml
--- a/examples/vllm/deploy/agg_router.yaml
+++ b/examples/vllm/deploy/agg_router.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  # Distinct from the "vllm-v1-agg" name used by agg.yaml so that both example
+  # manifests can be applied to the same Kubernetes namespace without one
+  # replacing the other.
+  name: vllm-v1-agg-router
+spec:
+  services:
+    Frontend:
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      # Kept identical to metadata.name and to the worker's dynamoNamespace,
+      # mirroring how disagg_router.yaml names its services.
+      dynamoNamespace: vllm-v1-agg-router
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          workingDir: /workspace/examples/vllm
+          # NOTE(review): these args are the same as the Frontend args in
+          # agg.yaml; no router-specific flag is passed here. Confirm that KV
+          # routing is actually enabled for this deployment.
+          args:
+            - dynamo
+            - run
+            - in=http
+            - out=dyn
+            - --http-port
+            - "8000"
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      dynamoNamespace: vllm-v1-agg-router
+      componentType: worker
+      replicas: 2
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          workingDir: /workspace/examples/vllm
+          # The readiness probe above greps /tmp/vllm.log, which this command
+          # populates via tee.
+          args:
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
--- a/examples/vllm/deploy/disagg.yaml
+++ b/examples/vllm/deploy/disagg.yaml
@@ -15,13 +15,31 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: disagg
+  name: vllm-v1-disagg
 spec:
  services:
    Frontend:
      dynamoNamespace: vllm-v1-disagg
      componentType: main
      replicas: 1
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      resources:
        requests:
          cpu: "1"
@@ -31,51 +49,39 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
            - dynamo
-            - serve
+            - run
-            - graphs.disagg:Frontend
+            - in=http
-            - --system-app-port
+            - out=dyn
-            - "5000"
+            - --http-port
-            - --enable-system-app
+            - "8000"
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-            - -f
-            - ./configs/disagg.yaml
-    SimpleLoadBalancer:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: vllm-v1-disagg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "20Gi"
-        limits:
-          cpu: "1"
-          memory: "20Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v1
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:SimpleLoadBalancer
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - SimpleLoadBalancer
-            - -f
-            - ./configs/disagg.yaml
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg
      envFromSecret: hf-token-secret
+      componentType: worker
      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      resources:
        requests:
          cpu: "10"
@@ -87,24 +93,34 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
-            - dynamo
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
-            - serve
-            - graphs.disagg:VllmDecodeWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmDecodeWorker
-            - -f
-            - ./configs/disagg.yaml
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg
      envFromSecret: hf-token-secret
+      componentType: worker
      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      resources:
        requests:
          cpu: "10"
@@ -116,17 +132,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
-            - dynamo
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
-            - serve
-            - graphs.disagg:VllmPrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmPrefillWorker
-            - -f
-            - ./configs/disagg.yaml
--- a/examples/vllm/deploy/disagg_planner.yaml
+++ b/examples/vllm/deploy/disagg_planner.yaml
@@ -15,172 +15,124 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: disagg-planner
+  name: vllm-v1-disagg-planner
 spec:
  services:
    Frontend:
      dynamoNamespace: vllm-v1-disagg-planner
      componentType: main
      replicas: 1
-      resources:
+      livenessProbe:
-        requests:
+        httpGet:
-          cpu: "2"
+          path: /health
-          memory: "4Gi"
+          port: 8000
-        limits:
+        initialDelaySeconds: 60
-          cpu: "2"
+        periodSeconds: 60
-          memory: "4Gi"
+        timeoutSeconds: 30
-      extraPodSpec:
+        failureThreshold: 10
-        mainContainer:
+      readinessProbe:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        exec:
-          workingDir: /workspace/examples/vllm_v1
+          command:
-          args:
+            - /bin/sh
-            - dynamo
+            - -c
-            - serve
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
-            - graphs.disagg_planner:Frontend
+        initialDelaySeconds: 60
-            - --system-app-port
+        periodSeconds: 60
-            - "5000"
+        timeoutSeconds: 30
-            - --enable-system-app
+        failureThreshold: 10
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-            - -f
-            - ./configs/disagg_planner.yaml
-    SimpleLoadBalancer:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: vllm-v1-disagg-planner
-      replicas: 1
      resources:
        requests:
          cpu: "1"
-          memory: "20Gi"
+          memory: "2Gi"
        limits:
          cpu: "1"
-          memory: "20Gi"
+          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
            - dynamo
-            - serve
+            - run
-            - graphs.disagg_planner:SimpleLoadBalancer
+            - in=http
-            - --system-app-port
+            - out=dyn
-            - "5000"
+            - --http-port
-            - --enable-system-app
+            - "8000"
-            - --use-default-health-checks
-            - --service-name
-            - SimpleLoadBalancer
-            - -f
-            - ./configs/disagg_planner.yaml
    VllmDecodeWorker:
      dynamoNamespace: vllm-v1-disagg-planner
      envFromSecret: hf-token-secret
+      componentType: worker
      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      resources:
        requests:
-          cpu: "20"
+          cpu: "10"
-          memory: "40Gi"
+          memory: "20Gi"
-          gpu: "2"
+          gpu: "1"
        limits:
-          cpu: "20"
+          cpu: "10"
-          memory: "40Gi"
+          memory: "20Gi"
-          gpu: "2"
+          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
-            - dynamo
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
-            - serve
-            - graphs.disagg_planner:VllmDecodeWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmDecodeWorker
-            - -f
-            - ./configs/disagg_planner.yaml
    VllmPrefillWorker:
      dynamoNamespace: vllm-v1-disagg-planner
      envFromSecret: hf-token-secret
+      componentType: worker
      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
      resources:
        requests:
-          cpu: "20"
+          cpu: "10"
-          memory: "40Gi"
+          memory: "20Gi"
-          gpu: "2"
+          gpu: "1"
-        limits:
-          cpu: "20"
-          memory: "40Gi"
-          gpu: "2"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v1
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:VllmPrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmPrefillWorker
-            - -f
-            - ./configs/disagg_planner.yaml
-    Planner:
-      dynamoNamespace: vllm-v1-disagg-planner
-      replicas: 1
-      componentType: planner
-      resources:
-        requests:
-          cpu: "2"
-          memory: "2Gi"
-        limits:
-          cpu: "2"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v1
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg_planner:Planner
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Planner
-            - --Planner.environment=kubernetes
-            - -f
-            - ./configs/disagg_planner.yaml
-    Prometheus:
-      dynamoNamespace: vllm-v1-disagg-planner
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1000m"
-          memory: "1000Mi"
        limits:
-          cpu: "1000m"
+          cpu: "10"
-          memory: "1000Mi"
+          memory: "20Gi"
+          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
-          workingDir: /workspace/examples/vllm_v1
+          workingDir: /workspace/examples/vllm
          args:
-            - dynamo
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"
-            - serve
-            - graphs.disagg_planner:Prometheus
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Prometheus
-            - -f
-            - ./configs/disagg_planner.yaml
--- a/examples/vllm/deploy/disagg_router.yaml
+++ b/examples/vllm/deploy/disagg_router.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-v1-disagg-router
+spec:
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v1-disagg-router
+      componentType: main
+      replicas: 1
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8000
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          workingDir: /workspace/examples/vllm
+          args:
+            - dynamo
+            - run
+            - in=http
+            - out=dyn
+            - --http-port
+            - "8000"
+    VllmDecodeWorker:
+      dynamoNamespace: vllm-v1-disagg-router
+      envFromSecret: hf-token-secret
+      componentType: worker
+      replicas: 2
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          workingDir: /workspace/examples/vllm
+          args:
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
+    VllmPrefillWorker:
+      dynamoNamespace: vllm-v1-disagg-router
+      envFromSecret: hf-token-secret
+      componentType: worker
+      replicas: 1
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          workingDir: /workspace/examples/vllm
+          args:
+            - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log"