fix: Revert "feat: add vLLM v1 multi-modal example. Add llama4 Maverick ex… (#2017)

fe63c17a · Alec · GitHub · bf1998f0 · bf1998f0 · bf1998f0
Unverified Commit fe63c17a authored Jul 18, 2025 by Alec Committed by GitHub Jul 19, 2025
3 changed files
--- a/examples/multimodal_v1/utils/logging.py
+++ b/examples/multimodal_v1/utils/logging.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import asyncio
-import logging
-
-from dynamo._core import Client
-
-logger = logging.getLogger(__name__)
-
-
-async def check_required_workers(
-    workers_client: Client, required_workers: int, on_change=True, poll_interval=0.5
-):
-    """Wait until the minimum number of workers are ready."""
-    worker_ids = workers_client.instance_ids()
-    num_workers = len(worker_ids)
-
-    while num_workers < required_workers:
-        await asyncio.sleep(poll_interval)
-        worker_ids = workers_client.instance_ids()
-        new_count = len(worker_ids)
-
-        if (not on_change) or new_count != num_workers:
-            logger.info(
-                f"Waiting for more workers to be ready.\n"
-                f" Current: {new_count},"
-                f" Required: {required_workers}"
-            )
-        num_workers = new_count
-
-    print(f"Workers ready: {worker_ids}")
-    return worker_ids
--- a/examples/multimodal_v1/utils/model.py
+++ b/examples/multimodal_v1/utils/model.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Any, Dict, Tuple
-
-import torch
-from transformers import AutoConfig
-from utils.protocol import EncodeResponse
-from vllm import AsyncEngineArgs
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.worker.worker import Worker
-
-# from transformers import AutoImageProcessor, LlavaForConditionalGeneration
-# from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-
-
-logger = logging.getLogger(__name__)
-
-
-def load_vision_model(model_id: str) -> torch.nn.Module:
-    """
-    Load a vision model from a HuggingFace model ID.
-    """
-    engine_args = AsyncEngineArgs(model=model_id, trust_remote_code=True)
-
-    engine_config = engine_args.create_engine_config()
-    distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
-    worker = Worker(
-        vllm_config=engine_config,
-        local_rank=0,
-        rank=0,
-        distributed_init_method=distributed_init_method,
-        is_driver_worker=True,
-    )
-    # Initialize the worker.
-    worker.init_device()
-    worker.load_model()
-    return worker.model_runner.model
-    # model = LlavaForConditionalGeneration.from_pretrained(
-    #     model_id, device_map="auto", torch_dtype=torch.float16
-    # ).eval()
-
-    # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    #     model_id, torch_dtype="auto", device_map="auto"
-    # ).eval()
-    # return model
-
-
-def get_vision_embeddings_info(
-    model_id: str, num_patches: int
-) -> Tuple[Tuple[int, int, int], torch.dtype]:
-    """Calculate vision embeddings size and dtype using model config
-    Returns ((1, num_patches, hidden_size), dtype): the batch size is always 1, and hidden_size falls back to 4096 when the config does not define it.
-    """
-    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
-    assert num_patches > 0, "Number of patches must be positive"
-    if not hasattr(config, "torch_dtype"):
-        raise ValueError("Model config missing required 'torch_dtype' attribute")
-    if not hasattr(config, "hidden_size"):
-        logger.warning(
-            "Model config missing required 'hidden_size' attribute, using 4096"
-        )
-        hidden_size = 4096
-    else:
-        hidden_size = config.hidden_size
-    return (1, num_patches, hidden_size), config.torch_dtype
-
-
-def construct_mm_data(
-    model: str,
-    encode_output: EncodeResponse,
-    image_embeds: torch.Tensor,
-    embeddings_dtype: torch.dtype,
-) -> Dict[str, torch.Tensor | Dict[str, Any]]:
-    """Construct multimodal data for a vLLM request for models that require additional parameters alongside the embeddings"""
-    image_embeds = image_embeds.to(embeddings_dtype)
-    if "Qwen2" in model:
-        return {
-            "image": {
-                "image_embeds": image_embeds.squeeze(0),
-                "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
-            }
-        }
-    elif "MiniCPM-V" in model:
-        return {
-            "image": {
-                "image_embeds": image_embeds,
-                "image_sizes": encode_output.image_sizes,
-            }
-        }
-    else:
-        return {"image": image_embeds}
--- a/examples/multimodal_v1/utils/protocol.py
+++ b/examples/multimodal_v1/utils/protocol.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-from typing import Any, List, Literal, Optional, Union
-
-import connect
-import msgspec
-from pydantic import BaseModel, ConfigDict, field_validator
-from pydantic_core import core_schema
-from typing_extensions import NotRequired
-from vllm.inputs.data import TokensPrompt
-from vllm.outputs import CompletionOutput
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import PromptLogprobs, RequestMetrics
-
-
-class Request(BaseModel):
-    prompt: str
-    sampling_params: dict
-
-
-class Tokens(BaseModel):
-    tokens: list[int]
-
-
-class PrefillRequest(Request):
-    request_id: str
-
-
-class Response(BaseModel):
-    text: str
-
-
-class PrefillResponse(BaseModel):
-    prefilled: bool
-
-
-# Hack to override the type of multi_modal_data in TokensPrompt
-# as pydantic doesn't understand generic types
-# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
-# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
-# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
-class PatchedTokensPrompt(TokensPrompt):
-    multi_modal_data: NotRequired[Optional[Any]]  # type: ignore
-
-
-# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
-# SamplingParams is a msgspec struct
-# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
-
-SamplingParams.__get_pydantic_core_schema__ = classmethod(
-    lambda cls, source, handler: core_schema.any_schema()
-)
-
-
-class vLLMGenerateRequest(BaseModel):
-    """
-    Serializable class of all the fields vLLM engine requires for inference
-    """
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    engine_prompt: PatchedTokensPrompt
-    sampling_params: SamplingParams
-    request_id: str
-    prefix_hit_rate: Optional[float] = 0.0
-
-    @field_validator("sampling_params", mode="before")
-    @classmethod
-    def parse_sampling_params(cls, v: Any) -> SamplingParams:
-        if isinstance(v, str):
-            v = json.loads(v)
-        if isinstance(v, dict):
-            return SamplingParams(**v)
-        return v
-
-    model_config = ConfigDict(
-        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)}
-    )
-
-
-class TextContent(BaseModel):
-    type: Literal["text"]
-    text: str
-
-
-class ImageURLDetail(BaseModel):
-    url: str
-
-
-class ImageContent(BaseModel):
-    type: Literal["image_url"]
-    image_url: ImageURLDetail
-
-
-MessageContent = Union[TextContent, ImageContent]
-
-
-class ChatMessage(BaseModel):
-    role: Literal["user", "system", "assistant"]
-    content: List[MessageContent]
-
-
-class MultiModalRequest(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    model: str
-    messages: List[ChatMessage]
-    max_tokens: Optional[int] = None
-    temperature: Optional[float] = None
-    stream: Optional[bool] = True
-
-
-class vLLMMultimodalRequest(vLLMGenerateRequest):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    image_url: Optional[str] = None
-    # image_features: Optional[List[List[List[float]]]] = None # Remove once have NIXL support
-    serialized_request: Optional[connect.SerializedRequest] = None
-
-
-class EncodeRequest(BaseModel):
-    """
-    Serializable class of the fields sent with an encode request: the image URL, the request ID, and an optional serialized request
-    """
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    image_url: str
-    request_id: str
-    serialized_request: Optional[connect.SerializedRequest] = None
-
-
-class EncodeResponse(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-    request_id: str
-    image_grid_thw: Optional[List[Any]] = None
-    image_sizes: Optional[List[Any]] = None
-    serialized_request: Optional[connect.SerializedRequest] = None
-    image_features: List[List[List[float]]]  # Remove once have NIXL support
-
-
-class MyRequestOutput(BaseModel):
-    """
-    RequestOutput from vLLM is not serializable by default
-    https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85
-
-    This class is used to serialize the RequestOutput and any recursively defined types
-    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
-    """
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    request_id: str
-    prompt: Optional[str] = None
-    prompt_token_ids: Optional[List[int]] = None
-    prompt_logprobs: Optional[PromptLogprobs] = None
-    outputs: List[CompletionOutput]
-    finished: bool
-    metrics: Optional[RequestMetrics] = None
-    kv_transfer_params: Optional[dict[str, Any]] = None
-    # lora_request: Optional[LoRARequest] = None
-    # encoder_prompt: Optional[str] = None
-    # encoder_prompt_token_ids: Optional[List[int]] = None
-    # num_cached_tokens: Optional[int] = None
-    # multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None