Unverified Commit e1ae0f15 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat: add multimodal k8s deployment manifest (#1836)

parent 39c8d125
......@@ -428,3 +428,60 @@ You should see a response describing the video's content similar to
]
}
```
## Deploying Multimodal Examples on Kubernetes
This guide will help you quickly deploy and clean up the multimodal example services in Kubernetes.
### Prerequisites
- **Dynamo Cloud** is already deployed in your target Kubernetes namespace.
- You have `kubectl` access to your cluster and the correct namespace set in `$NAMESPACE`.
### Create a secret with your Hugging Face token
```bash
export HF_TOKEN="<your Hugging Face Hub token with read access to the models>"
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="$HF_TOKEN" -n $NAMESPACE || true
```
---
Choose the example you want to deploy or delete. The YAML files are located in `examples/multimodal/deploy/k8s/`.
### Deploy the Multimodal Example
```bash
kubectl apply -f examples/multimodal/deploy/k8s/<Example yaml file> -n $NAMESPACE
```
### Uninstall the Multimodal Example
```bash
kubectl delete -f examples/multimodal/deploy/k8s/<Example yaml file> -n $NAMESPACE
```
### Using a different dynamo container
To customize the container image used in your deployment, you will need to update the manifest before applying it.
You can use [`yq`](https://github.com/mikefarah/yq?tab=readme-ov-file#install), a portable command-line YAML processor.
Please follow the [installation instructions](https://github.com/mikefarah/yq?tab=readme-ov-file#install) for your platform if you do not already have `yq` installed. After installing `yq`, you can generate and apply your manifest as follows:
```bash
export DYNAMO_IMAGE=my-registry/my-image:tag
export EXAMPLE_FILE="examples/multimodal/deploy/k8s/<Example yaml file>"
yq '.spec.services.[].extraPodSpec.mainContainer.image = env(DYNAMO_IMAGE)' "$EXAMPLE_FILE" > my_example_manifest.yaml
# install the dynamo example
kubectl apply -f my_example_manifest.yaml -n $NAMESPACE
# uninstall the dynamo example
kubectl delete -f my_example_manifest.yaml -n $NAMESPACE
```
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the graphs.agg multimodal graph, configured by
# ./configs/agg-llava.yaml. Every service runs `dynamo serve` for one component
# of the graph, with its system app enabled on port 5000.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-llava
spec:
  # No deployment-wide environment variables. Written as an explicit empty
  # list because a bare `envs:` key parses as null.
  envs: []
  services:
    # CPU-only service; the only one declared with componentType "main".
    Frontend:
      dynamoNamespace: agg-llava
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
            - -f
            - ./configs/agg-llava.yaml
    # CPU-only worker component.
    Processor:
      dynamoNamespace: agg-llava
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
            - -f
            - ./configs/agg-llava.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-llava
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
            - -f
            - ./configs/agg-llava.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmEncodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-llava
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:VllmEncodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmEncodeWorker
            - -f
            - ./configs/agg-llava.yaml
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the graphs.agg multimodal graph, configured by
# ./configs/agg-phi3v.yaml. Every service runs `dynamo serve` for one component
# of the graph, with its system app enabled on port 5000.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-phi3v
spec:
  # No deployment-wide environment variables. Written as an explicit empty
  # list because a bare `envs:` key parses as null.
  envs: []
  services:
    # CPU-only service; the only one declared with componentType "main".
    Frontend:
      dynamoNamespace: agg-phi3v
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
            - -f
            - ./configs/agg-phi3v.yaml
    # CPU-only worker component.
    Processor:
      dynamoNamespace: agg-phi3v
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
            - -f
            - ./configs/agg-phi3v.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-phi3v
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
            - -f
            - ./configs/agg-phi3v.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmEncodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-phi3v
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:VllmEncodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmEncodeWorker
            - -f
            - ./configs/agg-phi3v.yaml
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the graphs.agg multimodal graph, configured by
# ./configs/agg-qwen.yaml. Every service runs `dynamo serve` for one component
# of the graph, with its system app enabled on port 5000.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-qwen
spec:
  # No deployment-wide environment variables. Written as an explicit empty
  # list because a bare `envs:` key parses as null.
  envs: []
  services:
    # CPU-only service; the only one declared with componentType "main".
    Frontend:
      dynamoNamespace: agg-qwen
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
            - -f
            - ./configs/agg-qwen.yaml
    # CPU-only worker component.
    Processor:
      dynamoNamespace: agg-qwen
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
            - -f
            - ./configs/agg-qwen.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-qwen
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
            - -f
            - ./configs/agg-qwen.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmEncodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-qwen
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg:VllmEncodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmEncodeWorker
            - -f
            - ./configs/agg-qwen.yaml
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the graphs.agg_video multimodal graph, configured
# by ./configs/agg_video.yaml. Every service runs `dynamo serve` for one
# component of the graph, with its system app enabled on port 5000.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-video
spec:
  # No deployment-wide environment variables. Written as an explicit empty
  # list because a bare `envs:` key parses as null.
  envs: []
  services:
    # CPU-only service; the only one declared with componentType "main".
    Frontend:
      dynamoNamespace: agg-video
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg_video:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
            - -f
            - ./configs/agg_video.yaml
    # CPU-only worker component.
    Processor:
      dynamoNamespace: agg-video
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg_video:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
            - -f
            - ./configs/agg_video.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-video
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg_video:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
            - -f
            - ./configs/agg_video.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmEncodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: agg-video
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.agg_video:VllmEncodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmEncodeWorker
            - -f
            - ./configs/agg_video.yaml
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the graphs.disagg_video multimodal graph,
# configured by ./configs/disagg_video.yaml. Every service runs `dynamo serve`
# for one component of the graph, with its system app enabled on port 5000.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-video
spec:
  # No deployment-wide environment variables. Written as an explicit empty
  # list because a bare `envs:` key parses as null.
  envs: []
  services:
    # CPU-only service; the only one declared with componentType "main".
    Frontend:
      dynamoNamespace: disagg-video
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg_video:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
            - -f
            - ./configs/disagg_video.yaml
    # CPU-only worker component.
    Processor:
      dynamoNamespace: disagg-video
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg_video:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
            - -f
            - ./configs/disagg_video.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: disagg-video
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg_video:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
            - -f
            - ./configs/disagg_video.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmEncodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: disagg-video
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg_video:VllmEncodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmEncodeWorker
            - -f
            - ./configs/disagg_video.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmPrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: disagg-video
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg_video:VllmPrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmPrefillWorker
            - -f
            - ./configs/disagg_video.yaml
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the graphs.disagg multimodal graph, configured by
# ./configs/disagg.yaml. Every service runs `dynamo serve` for one component
# of the graph, with its system app enabled on port 5000.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-llava
spec:
  # No deployment-wide environment variables. Written as an explicit empty
  # list because a bare `envs:` key parses as null.
  envs: []
  services:
    # CPU-only service; the only one declared with componentType "main".
    Frontend:
      dynamoNamespace: disagg-llava
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
            - -f
            - ./configs/disagg.yaml
    # CPU-only worker component.
    Processor:
      dynamoNamespace: disagg-llava
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
            - -f
            - ./configs/disagg.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: disagg-llava
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmDecodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmDecodeWorker
            - -f
            - ./configs/disagg.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmEncodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: disagg-llava
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmEncodeWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmEncodeWorker
            - -f
            - ./configs/disagg.yaml
    # Requests one GPU; environment comes from the hf-token-secret Secret.
    VllmPrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: disagg-llava
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/multimodal
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmPrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmPrefillWorker
            - -f
            - ./configs/disagg.yaml
\ No newline at end of file
......@@ -21,7 +21,6 @@ from transformers import AutoConfig
from utils.protocol import EncodeResponse
from vllm import AsyncEngineArgs
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.worker import Worker
logger = logging.getLogger(__name__)
......@@ -30,6 +29,9 @@ def load_vision_model(model_id: str) -> torch.nn.Module:
"""
Load a vision model from a HuggingFace model ID.
"""
# lazy import to avoid cuda error if not on gpu
from vllm.worker.worker import Worker
engine_args = AsyncEngineArgs(model=model_id, trust_remote_code=True)
engine_config = engine_args.create_engine_config()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment