feat: add multimodal k8s deployment manifest (#1836)

e1ae0f15 · Biswa Panda · GitHub · 39c8d125 · e1ae0f15 · e1ae0f15
Unverified Commit e1ae0f15 authored Jul 10, 2025 by Biswa Panda Committed by GitHub Jul 10, 2025
8 changed files
--- a/examples/multimodal/README.md
+++ b/examples/multimodal/README.md
@@ -428,3 +428,60 @@ You should see a response describing the video's content similar to
  ]
 }
 ```
+
+
+## Deploying Multimodal Examples on Kubernetes
+
+This guide will help you quickly deploy and clean up the multimodal example services in Kubernetes.
+
+### Prerequisites
+
+- **Dynamo Cloud** is already deployed in your target Kubernetes namespace.
+- You have `kubectl` access to your cluster and the correct namespace set in `$NAMESPACE`.
+
+
+### Create a secret with your Hugging Face token
+
+```bash
+export HF_TOKEN="<your Hugging Face Hub token with read access to the models>"
+kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n "$NAMESPACE" || true
+```
+
+---
+
+Choose the example you want to deploy or delete. The YAML files are located in `examples/multimodal/deploy/k8s/`.
+
+### Deploy the Multimodal Example
+
+```bash
+kubectl apply -f examples/multimodal/deploy/k8s/<Example yaml file> -n $NAMESPACE
+```
+
+### Uninstall the Multimodal Example
+
+
+```bash
+kubectl delete -f examples/multimodal/deploy/k8s/<Example yaml file> -n $NAMESPACE
+```
+
+### Using a different dynamo container
+
+To customize the container image used in your deployment, you will need to update the manifest before applying it.
+
+You can use [`yq`](https://github.com/mikefarah/yq?tab=readme-ov-file#install), a portable command-line YAML processor.
+
+Please follow the [installation instructions](https://github.com/mikefarah/yq?tab=readme-ov-file#install) for your platform if you do not already have `yq` installed. After installing `yq`, you can generate and apply your manifest as follows:
+
+
+```bash
+export DYNAMO_IMAGE=my-registry/my-image:tag
+export EXAMPLE_FILE="examples/multimodal/deploy/k8s/<Example yaml file>"
+
+yq '.spec.services.[].extraPodSpec.mainContainer.image = env(DYNAMO_IMAGE)' $EXAMPLE_FILE > my_example_manifest.yaml
+
+# install the dynamo example
+kubectl apply -f my_example_manifest.yaml -n $NAMESPACE
+
+# uninstall the dynamo example
+kubectl delete -f my_example_manifest.yaml -n $NAMESPACE
+
+```
\ No newline at end of file
--- a/examples/multimodal/deploy/k8s/agg-llava.yaml
+++ b/examples/multimodal/deploy/k8s/agg-llava.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DynamoGraphDeployment "agg-llava": every entry under spec.services runs one
+# component of graphs.agg through `dynamo serve`, all of them loading
+# ./configs/agg-llava.yaml from /workspace/examples/multimodal.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg-llava
+spec:
+  # Explicit empty list: a bare `envs:` parses as null, not an empty collection.
+  envs: []
+  services:
+    # componentType "main"; requests CPU and memory only (no GPU).
+    Frontend:
+      dynamoNamespace: agg-llava
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            # Quoted so the port is passed as a string argument.
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+            - -f
+            - ./configs/agg-llava.yaml
+    # componentType "worker"; requests CPU and memory only (no GPU).
+    Processor:
+      dynamoNamespace: agg-llava
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+            - -f
+            - ./configs/agg-llava.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    # NOTE(review): the GPU workers set no componentType, unlike Processor —
+    # confirm this is intended.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-llava
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+            - -f
+            - ./configs/agg-llava.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmEncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-llava
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmEncodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmEncodeWorker
+            - -f
+            - ./configs/agg-llava.yaml
\ No newline at end of file
--- a/examples/multimodal/deploy/k8s/agg-phi3v.yaml
+++ b/examples/multimodal/deploy/k8s/agg-phi3v.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DynamoGraphDeployment "agg-phi3v": every entry under spec.services runs one
+# component of graphs.agg through `dynamo serve`, all of them loading
+# ./configs/agg-phi3v.yaml from /workspace/examples/multimodal.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg-phi3v
+spec:
+  # Explicit empty list: a bare `envs:` parses as null, not an empty collection.
+  envs: []
+  services:
+    # componentType "main"; requests CPU and memory only (no GPU).
+    Frontend:
+      dynamoNamespace: agg-phi3v
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            # Quoted so the port is passed as a string argument.
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+            - -f
+            - ./configs/agg-phi3v.yaml
+    # componentType "worker"; requests CPU and memory only (no GPU).
+    Processor:
+      dynamoNamespace: agg-phi3v
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+            - -f
+            - ./configs/agg-phi3v.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    # NOTE(review): the GPU workers set no componentType, unlike Processor —
+    # confirm this is intended.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-phi3v
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+            - -f
+            - ./configs/agg-phi3v.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmEncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-phi3v
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmEncodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmEncodeWorker
+            - -f
+            - ./configs/agg-phi3v.yaml
\ No newline at end of file
--- a/examples/multimodal/deploy/k8s/agg-qwen.yaml
+++ b/examples/multimodal/deploy/k8s/agg-qwen.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DynamoGraphDeployment "agg-qwen": every entry under spec.services runs one
+# component of graphs.agg through `dynamo serve`, all of them loading
+# ./configs/agg-qwen.yaml from /workspace/examples/multimodal.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg-qwen
+spec:
+  # Explicit empty list: a bare `envs:` parses as null, not an empty collection.
+  envs: []
+  services:
+    # componentType "main"; requests CPU and memory only (no GPU).
+    Frontend:
+      dynamoNamespace: agg-qwen
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            # Quoted so the port is passed as a string argument.
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+            - -f
+            - ./configs/agg-qwen.yaml
+    # componentType "worker"; requests CPU and memory only (no GPU).
+    Processor:
+      dynamoNamespace: agg-qwen
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+            - -f
+            - ./configs/agg-qwen.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    # NOTE(review): the GPU workers set no componentType, unlike Processor —
+    # confirm this is intended.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-qwen
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+            - -f
+            - ./configs/agg-qwen.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmEncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-qwen
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmEncodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmEncodeWorker
+            - -f
+            - ./configs/agg-qwen.yaml
\ No newline at end of file
--- a/examples/multimodal/deploy/k8s/agg-video.yaml
+++ b/examples/multimodal/deploy/k8s/agg-video.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DynamoGraphDeployment "agg-video": every entry under spec.services runs one
+# component of graphs.agg_video through `dynamo serve`, all of them loading
+# ./configs/agg_video.yaml from /workspace/examples/multimodal.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg-video
+spec:
+  # Explicit empty list: a bare `envs:` parses as null, not an empty collection.
+  envs: []
+  services:
+    # componentType "main"; requests CPU and memory only (no GPU).
+    Frontend:
+      dynamoNamespace: agg-video
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_video:Frontend
+            - --system-app-port
+            # Quoted so the port is passed as a string argument.
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+            - -f
+            - ./configs/agg_video.yaml
+    # componentType "worker"; requests CPU and memory only (no GPU).
+    Processor:
+      dynamoNamespace: agg-video
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_video:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+            - -f
+            - ./configs/agg_video.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    # NOTE(review): the GPU workers set no componentType, unlike Processor —
+    # confirm this is intended.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-video
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_video:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+            - -f
+            - ./configs/agg_video.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmEncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: agg-video
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_video:VllmEncodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmEncodeWorker
+            - -f
+            - ./configs/agg_video.yaml
\ No newline at end of file
--- a/examples/multimodal/deploy/k8s/disagg-video.yaml
+++ b/examples/multimodal/deploy/k8s/disagg-video.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DynamoGraphDeployment "disagg-video": every entry under spec.services runs
+# one component of graphs.disagg_video through `dynamo serve`, all of them
+# loading ./configs/disagg_video.yaml from /workspace/examples/multimodal.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg-video
+spec:
+  # Explicit empty list: a bare `envs:` parses as null, not an empty collection.
+  envs: []
+  services:
+    # componentType "main"; requests CPU and memory only (no GPU).
+    Frontend:
+      dynamoNamespace: disagg-video
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_video:Frontend
+            - --system-app-port
+            # Quoted so the port is passed as a string argument.
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+            - -f
+            - ./configs/disagg_video.yaml
+    # componentType "worker"; requests CPU and memory only (no GPU).
+    Processor:
+      dynamoNamespace: disagg-video
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_video:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+            - -f
+            - ./configs/disagg_video.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    # NOTE(review): the GPU workers set no componentType, unlike Processor —
+    # confirm this is intended.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: disagg-video
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_video:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+            - -f
+            - ./configs/disagg_video.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmEncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: disagg-video
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_video:VllmEncodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmEncodeWorker
+            - -f
+            - ./configs/disagg_video.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmPrefillWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: disagg-video
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_video:VllmPrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker
+            - -f
+            - ./configs/disagg_video.yaml
\ No newline at end of file
--- a/examples/multimodal/deploy/k8s/disagg.yaml
+++ b/examples/multimodal/deploy/k8s/disagg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# DynamoGraphDeployment "disagg-llava": every entry under spec.services runs
+# one component of graphs.disagg through `dynamo serve`, all of them loading
+# ./configs/disagg.yaml from /workspace/examples/multimodal.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg-llava
+spec:
+  # Explicit empty list: a bare `envs:` parses as null, not an empty collection.
+  envs: []
+  services:
+    # componentType "main"; requests CPU and memory only (no GPU).
+    Frontend:
+      dynamoNamespace: disagg-llava
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            # Quoted so the port is passed as a string argument.
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+            - -f
+            - ./configs/disagg.yaml
+    # componentType "worker"; requests CPU and memory only (no GPU).
+    Processor:
+      dynamoNamespace: disagg-llava
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+            - -f
+            - ./configs/disagg.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    # NOTE(review): the GPU workers set no componentType, unlike Processor —
+    # confirm this is intended.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: disagg-llava
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+            - -f
+            - ./configs/disagg.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmEncodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: disagg-llava
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmEncodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmEncodeWorker
+            - -f
+            - ./configs/disagg.yaml
+    # References the hf-token-secret Secret via envFromSecret; requests one GPU.
+    VllmPrefillWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: disagg-llava
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/multimodal
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmPrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker
+            - -f
+            - ./configs/disagg.yaml
\ No newline at end of file
--- a/examples/multimodal/utils/model.py
+++ b/examples/multimodal/utils/model.py
@@ -21,7 +21,6 @@ from transformers import AutoConfig
 from utils.protocol import EncodeResponse
 from vllm import AsyncEngineArgs
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.worker.worker import Worker

 logger = logging.getLogger(__name__)

@@ -30,6 +29,9 @@ def load_vision_model(model_id: str) -> torch.nn.Module:
    """
    Load a vision model from a HuggingFace model ID.
    """
+    # lazy import to avoid cuda error if not on gpu
+    from vllm.worker.worker import Worker
+
    engine_args = AsyncEngineArgs(model=model_id, trust_remote_code=True)

    engine_config = engine_args.create_engine_config()