feat: add vLLM v1 multi-modal example. Add llama4 Maverick example (#1990)

Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Co-authored-by: krishung5 <krish@nvidia.com>

feat: add vLLM v1 multi-modal example. Add llama4 Maverick example (#1990)
Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Co-authored-by: krishung5 <krish@nvidia.com>
353146e2 · GuanLuo · GitHub · 1f07dab7 · 353146e2 · 353146e2
Unverified Commit 353146e2 authored Jul 18, 2025 by GuanLuo Committed by GitHub Jul 18, 2025
3 changed files
--- a/examples/multimodal_v1/utils/logging.py
+++ b/examples/multimodal_v1/utils/logging.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import asyncio
+import logging
+from dynamo._core import Client
+logger = logging.getLogger(__name__)
+async def check_required_workers(
+    workers_client: Client, required_workers: int, on_change=True, poll_interval=0.5
+):
+    """Wait until at least ``required_workers`` worker instances are visible.
+
+    Polls ``workers_client.instance_ids()`` every ``poll_interval`` seconds
+    until the number of returned IDs reaches ``required_workers``.
+
+    Args:
+        workers_client: Client whose ``instance_ids()`` lists the currently
+            visible worker instances.
+        required_workers: Minimum number of instances to wait for.
+        on_change: When true, the "waiting" message is logged only when the
+            observed count differs from the previous poll; when false it is
+            logged after every poll that is still short of the requirement.
+        poll_interval: Seconds to sleep between polls.
+
+    Returns:
+        The worker IDs returned by the most recent ``instance_ids()`` call.
+    """
+    waiting_msg = "Waiting for more workers to be ready.\n Current: %s, Required: %s"
+    worker_ids = workers_client.instance_ids()
+    num_workers = len(worker_ids)
+    if num_workers < required_workers:
+        # Report the initial shortfall once; otherwise a count that never
+        # changes would make the function wait without logging anything.
+        logger.info(waiting_msg, num_workers, required_workers)
+    while num_workers < required_workers:
+        await asyncio.sleep(poll_interval)
+        worker_ids = workers_client.instance_ids()
+        new_count = len(worker_ids)
+        # Do not claim to be "waiting" on the poll that satisfies the requirement.
+        still_short = new_count < required_workers
+        if still_short and ((not on_change) or new_count != num_workers):
+            logger.info(waiting_msg, new_count, required_workers)
+        num_workers = new_count
+    # Use the module logger (not print) so this message follows the same
+    # logging configuration as the progress messages above.
+    logger.info("Workers ready: %s", worker_ids)
+    return worker_ids
--- a/examples/multimodal_v1/utils/model.py
+++ b/examples/multimodal_v1/utils/model.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from typing import Any, Dict, Tuple
+import torch
+from transformers import AutoConfig
+from utils.protocol import EncodeResponse
+from vllm import AsyncEngineArgs
+from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.worker.worker import Worker
+# from transformers import AutoImageProcessor, LlavaForConditionalGeneration
+# from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+logger = logging.getLogger(__name__)
+def load_vision_model(model_id: str) -> torch.nn.Module:
+    """
+    Load ``model_id`` through a single vLLM ``Worker`` and return the loaded model.
+
+    The engine config is built from ``AsyncEngineArgs`` with
+    ``trust_remote_code=True``; the worker is created as rank 0 / driver,
+    its device is initialised, the model is loaded, and
+    ``worker.model_runner.model`` is returned.
+    """
+    vllm_config = AsyncEngineArgs(
+        model=model_id, trust_remote_code=True
+    ).create_engine_config()
+    # Resolve the local IP first and an open port second, then build the init method.
+    host_ip = get_ip()
+    free_port = get_open_port()
+    init_method = get_distributed_init_method(host_ip, free_port)
+    driver = Worker(
+        vllm_config=vllm_config,
+        local_rank=0,
+        rank=0,
+        distributed_init_method=init_method,
+        is_driver_worker=True,
+    )
+    # Bring up the device, then load the model onto the worker.
+    driver.init_device()
+    driver.load_model()
+    return driver.model_runner.model
+    # Disabled alternatives that load the model through transformers instead:
+    # model = LlavaForConditionalGeneration.from_pretrained(
+    #     model_id, device_map="auto", torch_dtype=torch.float16
+    # ).eval()
+    # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    #     model_id, torch_dtype="auto", device_map="auto"
+    # ).eval()
+    # return model
+def get_vision_embeddings_info(
+    model_id: str, num_patches: int
+) -> Tuple[Tuple[int, int, int], torch.dtype]:
+    """Calculate vision embeddings size and dtype using model config.
+
+    Args:
+        model_id: Model identifier passed to ``AutoConfig.from_pretrained``
+            (loaded with ``trust_remote_code=True``).
+        num_patches: Number of patches; must be positive.
+
+    Returns:
+        A tuple ``((1, num_patches, hidden_dim), dtype)`` where ``hidden_dim``
+        is ``config.hidden_size`` (or 4096 when the config does not provide
+        one) and ``dtype`` is ``config.torch_dtype``.
+
+    Raises:
+        AssertionError: If ``num_patches`` is not positive.
+        ValueError: If the config has no usable ``torch_dtype``.
+    """
+    # Checked before loading the config so invalid input fails fast. Raised
+    # explicitly (same exception type as the former ``assert``) so the check
+    # is not stripped when Python runs with -O.
+    if not num_patches > 0:
+        raise AssertionError("Number of patches must be positive")
+    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+    # The attribute can exist but be None; treat that like a missing value
+    # instead of returning None where a torch.dtype is promised.
+    dtype = getattr(config, "torch_dtype", None)
+    if dtype is None:
+        raise ValueError("Model config missing required 'torch_dtype' attribute")
+    hidden_size = getattr(config, "hidden_size", None)
+    if hidden_size is None:
+        logger.warning(
+            "Model config missing required 'hidden_size' attribute, using 4096"
+        )
+        hidden_size = 4096
+    return (1, num_patches, hidden_size), dtype
+def construct_mm_data(
+    model: str,
+    encode_output: EncodeResponse,
+    image_embeds: torch.Tensor,
+    embeddings_dtype: torch.dtype,
+) -> Dict[str, torch.Tensor | Dict[str, Any]]:
+    """Build the ``{"image": ...}`` multimodal payload for a vLLM request.
+
+    The embeddings are first cast to ``embeddings_dtype``. Models whose name
+    contains "Qwen2" additionally get ``image_grid_thw`` (with the embeddings
+    and grid both squeezed on dim 0); models whose name contains "MiniCPM-V"
+    additionally get ``image_sizes``; every other model receives the
+    embeddings tensor directly.
+    """
+    embeds = image_embeds.to(embeddings_dtype)
+    if "Qwen2" in model:
+        squeezed_embeds = embeds.squeeze(0)
+        grid_thw = torch.tensor(encode_output.image_grid_thw).squeeze(0)
+        qwen_payload = {
+            "image_embeds": squeezed_embeds,
+            "image_grid_thw": grid_thw,
+        }
+        return {"image": qwen_payload}
+    if "MiniCPM-V" in model:
+        minicpm_payload = {
+            "image_embeds": embeds,
+            "image_sizes": encode_output.image_sizes,
+        }
+        return {"image": minicpm_payload}
+    # No model-specific extras: pass the embeddings through as-is.
+    return {"image": embeds}
--- a/examples/multimodal_v1/utils/protocol.py
+++ b/examples/multimodal_v1/utils/protocol.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+from typing import Any, List, Literal, Optional, Union
+import connect
+import msgspec
+from pydantic import BaseModel, ConfigDict, field_validator
+from pydantic_core import core_schema
+from typing_extensions import NotRequired
+from vllm.inputs.data import TokensPrompt
+from vllm.outputs import CompletionOutput
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import PromptLogprobs, RequestMetrics
+# Base request model: a prompt string plus sampling parameters as a plain dict.
+class Request(BaseModel):
+    prompt: str
+    sampling_params: dict
+# Model wrapping a list of integer tokens.
+class Tokens(BaseModel):
+    tokens: list[int]
+# Request (prompt + sampling_params) extended with a request_id string.
+class PrefillRequest(Request):
+    request_id: str
+# Response model with a single text field.
+class Response(BaseModel):
+    text: str
+# Response model with a single boolean flag.
+# NOTE(review): presumably reports whether prefill completed — confirm with the producer.
+class PrefillResponse(BaseModel):
+    prefilled: bool
+# Hack to override the type of multi_modal_data in TokensPrompt,
+# as pydantic doesn't understand generic types.
+# The field is re-declared below as NotRequired[Optional[Any]].
+# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
+# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
+# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
+class PatchedTokensPrompt(TokensPrompt):
+    multi_modal_data: NotRequired[Optional[Any]]  # type: ignore
+# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it.
+# SamplingParams is a msgspec struct, so the patched schema accepts any value (core_schema.any_schema()).
+# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
+SamplingParams.__get_pydantic_core_schema__ = classmethod(
+    lambda cls, source, handler: core_schema.any_schema()
+)
+class vLLMGenerateRequest(BaseModel):
+    """
+    Serializable class of all the fields vLLM engine requires for inference
+    """
+    # One combined config. The class used to assign ``model_config`` twice;
+    # the second assignment replaced the first in the class body, silently
+    # dropping ``arbitrary_types_allowed=True``.
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)},
+    )
+    engine_prompt: PatchedTokensPrompt
+    sampling_params: SamplingParams
+    request_id: str
+    prefix_hit_rate: Optional[float] = 0.0
+    @field_validator("sampling_params", mode="before")
+    @classmethod
+    def parse_sampling_params(cls, v: Any) -> SamplingParams:
+        """Coerce a JSON string or dict into ``SamplingParams``; pass anything else through."""
+        if isinstance(v, str):
+            # A JSON string is decoded first, then handled like any other value.
+            v = json.loads(v)
+        if isinstance(v, dict):
+            return SamplingParams(**v)
+        return v
+# Message content part whose type tag is the literal "text".
+class TextContent(BaseModel):
+    type: Literal["text"]
+    text: str
+# Holder for an image URL string; used as the image_url field of ImageContent.
+class ImageURLDetail(BaseModel):
+    url: str
+# Message content part whose type tag is the literal "image_url".
+class ImageContent(BaseModel):
+    type: Literal["image_url"]
+    image_url: ImageURLDetail
+# A message content part is either a text part or an image-URL part.
+MessageContent = Union[TextContent, ImageContent]
+# One chat message: a role ("user", "system" or "assistant") and a list of content parts.
+class ChatMessage(BaseModel):
+    role: Literal["user", "system", "assistant"]
+    content: List[MessageContent]
+# Chat-style request: model name, list of chat messages, and optional
+# max_tokens / temperature / stream settings (stream defaults to True).
+class MultiModalRequest(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    model: str
+    messages: List[ChatMessage]
+    max_tokens: Optional[int] = None
+    temperature: Optional[float] = None
+    stream: Optional[bool] = True
+# vLLMGenerateRequest extended with an optional image URL and an optional
+# connect.SerializedRequest.
+# NOTE(review): the purpose of serialized_request is not visible here — confirm against the connect library.
+class vLLMMultimodalRequest(vLLMGenerateRequest):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    image_url: Optional[str] = None
+    # image_features: Optional[List[List[List[float]]]] = None # Remove once have NIXL support
+    serialized_request: Optional[connect.SerializedRequest] = None
+class EncodeRequest(BaseModel):
+    """
+    Serializable request identifying an image by URL and the request it belongs to,
+    optionally accompanied by a connect.SerializedRequest
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    image_url: str
+    request_id: str
+    serialized_request: Optional[connect.SerializedRequest] = None
+# Response for an encode request, keyed by request_id.
+class EncodeResponse(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    request_id: str
+    # Read by construct_mm_data (utils/model.py) for models whose name contains "Qwen2".
+    image_grid_thw: Optional[List[Any]] = None
+    # Read by construct_mm_data (utils/model.py) for models whose name contains "MiniCPM-V".
+    image_sizes: Optional[List[Any]] = None
+    serialized_request: Optional[connect.SerializedRequest] = None
+    # Image features as a triply nested list of floats; required field.
+    image_features: List[List[List[float]]]  # Remove once have NIXL support
+class MyRequestOutput(BaseModel):
+    """
+    RequestOutput from vLLM is not serializable by default
+    https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85
+    This class is used to serialize the RequestOutput and any recursively defined types
+    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
+    """
+    # arbitrary_types_allowed lets the vLLM types below be used as field types.
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    request_id: str
+    prompt: Optional[str] = None
+    prompt_token_ids: Optional[List[int]] = None
+    prompt_logprobs: Optional[PromptLogprobs] = None
+    outputs: List[CompletionOutput]
+    finished: bool
+    metrics: Optional[RequestMetrics] = None
+    kv_transfer_params: Optional[dict[str, Any]] = None
+    # Disabled fields (presumably further RequestOutput attributes that are
+    # not carried over yet — confirm against the linked vLLM definition):
+    # lora_request: Optional[LoRARequest] = None
+    # encoder_prompt: Optional[str] = None
+    # encoder_prompt_token_ids: Optional[List[int]] = None
+    # num_cached_tokens: Optional[int] = None
+    # multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None