Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

96ae75ad · zhuwenwen · f9f4a735 · 2339d59f · 96ae75ad · f9f4a735
Commit 96ae75ad authored Jan 04, 2025 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -19,45 +19,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
-from functools import cached_property, lru_cache
+from functools import cached_property
-from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
+from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple,
-                    Union)
+                    TypedDict, Union)
-import librosa
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import Qwen2AudioEncoder
+from transformers import BatchFeature, ProcessorMixin
+from transformers.models.qwen2_audio import (Qwen2AudioConfig,
+                                             Qwen2AudioEncoder,
+                                             Qwen2AudioProcessor)
+from transformers.models.whisper import WhisperFeatureExtractor
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+from vllm.inputs import InputContext
-                         InputContext, token_inputs)
-from vllm.logger import init_logger
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import NestedTensors
-from vllm.multimodal.utils import consecutive_placeholder_ranges
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
-from vllm.sequence import IntermediateTensors, SequenceData
+                                        MultiModalDataItems, ProcessorInputs,
+                                        PromptReplacement)
+from vllm.sequence import IntermediateTensors
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
                    maybe_prefix, merge_multimodal_embeddings)
-logger = init_logger(__name__)
 # # === Audio Inputs === #
 class Qwen2AudioInputs(TypedDict):
    input_features: torch.Tensor
-    """Shape: 
+    """Shape: `(num_audios, num_mel_bins, 3000)`"""
-    `(num_audios, num_mel_bins, 3000)`
-    """
    feature_attention_mask: torch.Tensor
-    """Shape: `(num_audios, 3000)`
+    """Shape: `(num_audios, 3000)`"""
-    """
 # === Audio Encoder === #
@@ -74,187 +72,116 @@ class Qwen2AudioMultiModalProjector(nn.Module):
        return hidden_states
-def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int,
+# From Qwen2AudioEncoder._get_feat_extract_output_lengths
-                               mm_counts: Mapping[str, int]):
+def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
-    num_audios = mm_counts["audio"]
+    feat_lengths = (input_lengths - 1) // 2 + 1
-    max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx)
+    output_lengths = (feat_lengths - 2) // 2 + 1
-    max_llm_audio_tokens = max_tokens_per_audio * num_audios
+    return feat_lengths, output_lengths
-    if seq_len - max_llm_audio_tokens - 2 < 0:
-        raise RuntimeError(
-            f"Qwen2-Audio cannot process {num_audios} audios in a prompt, "
-            "please increase max_model_len or reduce audio limit by "
-            "--limit-mm-per-prompt.")
-    audio_token_index = ctx.model_config.hf_config.audio_token_index
-    dummy_seqdata = SequenceData.from_prompt_token_counts(
-        (audio_token_index, max_llm_audio_tokens),
-        (0, seq_len - max_llm_audio_tokens),
-    )
-    dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.)
-    return DummyData(
-        dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, {
-            "audio":
-            consecutive_placeholder_ranges(num_items=num_audios,
-                                           item_size=max_tokens_per_audio)
-        })
-def get_processor(
-    processor_name: str,
-    *args,
-    trust_remote_code: bool = False,
-    **kwargs,
-):
-    """Gets a processor for the given model name via HuggingFace.
-    Derived from `vllm.transformers_utils.image_processor.get_image_processor`.
-    """
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoProcessor
-    try:
-        processor = AutoProcessor.from_pretrained(
-            processor_name,
-            *args,
-            trust_remote_code=trust_remote_code,
-            **kwargs)
-    except ValueError as e:
-        # If the error pertains to the processor class not existing or not
-        # currently being imported, suggest using the --trust-remote-code flag.
-        # Unlike AutoTokenizer, AutoProcessor does not separate such errors
-        if not trust_remote_code:
-            err_msg = (
-                "Failed to load the processor. If the processor is "
-                "a custom processor not yet available in the HuggingFace "
-                "transformers library, consider setting "
-                "`trust_remote_code=True` in LLM or using the "
-                "`--trust-remote-code` flag in the CLI.")
-            raise RuntimeError(err_msg) from e
-        else:
-            raise e
-    return processor
+def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
+    hf_config = ctx.get_hf_config(Qwen2AudioConfig)
+    max_source_position = hf_config.audio_config.max_source_positions
+    output_lengths = (max_source_position - 2) // 2 + 1
+    return output_lengths
-cached_get_processor = lru_cache(get_processor)
+class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
-def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
+    def _get_hf_processor(self) -> Qwen2AudioProcessor:
-    """
+        return self.ctx.get_hf_processor(Qwen2AudioProcessor)
-    Computes the output length of the convolutional layers
-    and the output length of the audio encoder
-    """
-    input_lengths = (input_lengths - 1) // 2 + 1
-    output_lengths = (input_lengths - 2) // 2 + 1
-    return input_lengths, output_lengths
+    def _get_feature_extractor(self) -> WhisperFeatureExtractor:
+        return self._get_hf_processor().feature_extractor  # type: ignore
-def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int:
+    def _get_processor_data(
-    max_source_position = (
+        self,
-        ctx.model_config.hf_config.audio_config.max_source_positions)
+        mm_items: MultiModalDataItems,
-    output_lengths = (max_source_position - 2) // 2 + 1
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
-    return output_lengths
+        # resample audio to the model's sampling rate
+        feature_extractor = self._get_feature_extractor()
+        mm_items.resample_audios(feature_extractor.sampling_rate)
+        return super()._get_processor_data(mm_items)
-def input_processor_for_qwen2_audio(
+    def _call_hf_processor(
-        ctx: InputContext, inputs: DecoderOnlyInputs) -> DecoderOnlyInputs:
+        self,
-    multi_modal_data = inputs.get("multi_modal_data")
+        hf_processor: ProcessorMixin,
-    if multi_modal_data is None or "audio" not in multi_modal_data:
+        prompt: str,
-        return inputs
+        processor_data: Mapping[str, object],
+        mm_processor_kwargs: Mapping[str, object],
-    audios = multi_modal_data["audio"]
+    ) -> BatchFeature:
-    if not isinstance(audios, list):
+        processor_data = dict(processor_data)
-        audios = [audios]
+        audios = processor_data.pop("audios", [])
-    if len(audios) == 0:
+        if audios:
-        return inputs
+            processor_data["audios"] = audios
-    processor = cached_get_processor(ctx.model_config.model)
+            feature_extractor = self._get_feature_extractor()
-    resampled_audios = [
+            mm_processor_kwargs = dict(
-        librosa.resample(audio,
+                **mm_processor_kwargs,
-                         orig_sr=sampling_rate,
+                sampling_rate=feature_extractor.sampling_rate,
-                         target_sr=processor.feature_extractor.sampling_rate)
+            )
-        for audio, sampling_rate in audios
+        else:
-    ]
+            # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
-    audio_input_lengths = np.array(
+            pass
-        [min(3000, _.shape[0] // 160 + 1) for _ in resampled_audios])
+        return super()._call_hf_processor(
-    audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
+            hf_processor,
-        audio_input_lengths)
+            prompt=prompt,
+            processor_data=processor_data,
-    audio_token_index = ctx.model_config.hf_config.audio_token_index
+            mm_processor_kwargs=mm_processor_kwargs,
+        )
-    input_ids = inputs['prompt_token_ids']
+    def _get_prompt_replacements(
-    new_input_ids = []
+        self,
-    audio_num = input_ids.count(audio_token_index)
+        mm_items: MultiModalDataItems,
-    assert len(audio_input_lengths) == audio_num, \
+        hf_inputs: BatchFeature,
-        (f'The text input contains {audio_num} audio tokens, '
+        mm_processor_kwargs: Mapping[str, object],
-         f'but {len(audio_input_lengths)} audios provided')
+    ) -> list[PromptReplacement]:
-    start = 0
+        hf_config = self.ctx.get_hf_config(Qwen2AudioConfig)
-    for audio_idx in range(audio_num):
+        placeholder = hf_config.audio_token_index
-        end = input_ids.index(audio_token_index, start)
-        new_input_ids.extend(input_ids[start:end])  # text part
+        feature_attention_mask = hf_inputs.get("feature_attention_mask")
+        if feature_attention_mask is None:
-        new_input_ids.extend([audio_token_index] *
+            audio_output_lengths = []
-                             audio_output_lengths[audio_idx])
+        else:
-        start = end + 1
+            _, audio_output_lengths = _get_feat_extract_output_lengths(
-    new_input_ids.extend(input_ids[start:])
+                feature_attention_mask.sum(-1))
-    return token_inputs(
+        def get_replacement_qwen2_audio(item_idx: int):
-        prompt_token_ids=new_input_ids,
+            return [placeholder] * audio_output_lengths[item_idx]
-        prompt=inputs.get("prompt"),
-        multi_modal_data=multi_modal_data,
+        return [
-    )
+            PromptReplacement(
+                modality="audio",
+                target=[placeholder],
-def input_mapper_for_qwen2_audio(
+                replacement=get_replacement_qwen2_audio,
-    ctx: InputContext,
+            )
-    multi_modal_data: Union[np.ndarray, List[np.ndarray]],
-) -> MultiModalKwargs:
-    """Input mapper for Qwen2-Audio."""
-    if not isinstance(multi_modal_data, list):
-        multi_modal_data = [multi_modal_data]
-    if len(multi_modal_data) == 0:
-        return MultiModalKwargs()
-    processor = cached_get_processor(ctx.model_config.model)
-    audio_feature_extractor = processor.feature_extractor
-    if audio_feature_extractor is None:
-        raise RuntimeError(
-            "No HuggingFace audio_feature_extractor is available "
-            "to process the audio object")
-    try:
-        resampled_audios = [
-            librosa.resample(
-                audio,
-                orig_sr=sampling_rate,
-                target_sr=processor.feature_extractor.sampling_rate)
-            for audio, sampling_rate in multi_modal_data
        ]
-        batch_data = audio_feature_extractor(resampled_audios,
-                                             sampling_rate=16000,
+    def _get_dummy_mm_inputs(
-                                             return_attention_mask=True,
+        self,
-                                             padding="max_length",
+        mm_counts: Mapping[str, int],
-                                             return_tensors="pt").data
+    ) -> ProcessorInputs:
-        batch_data["feature_attention_mask"] = batch_data.pop("attention_mask")
+        feature_extractor = self._get_feature_extractor()
-    except Exception:
+        sampling_rate = feature_extractor.sampling_rate
-        logger.error("Failed to process audio (%s)", multi_modal_data)
+        audio_len = feature_extractor.chunk_length * sampling_rate
-        raise
+        audio_count = mm_counts["audio"]
-    return MultiModalKwargs(batch_data)
+        audio = np.zeros(audio_len)
+        data = {"audio": [audio] * audio_count}
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
+        return ProcessorInputs(
-@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_audio)
+            prompt_text="<|AUDIO|>" * audio_count,
-@MULTIMODAL_REGISTRY.register_input_mapper("audio",
+            mm_data=data,
-                                           input_mapper_for_qwen2_audio)
+            mm_processor_kwargs={},
+        )
 @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
    "audio", get_max_qwen2_audio_audio_tokens)
+@MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor)
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
@@ -289,9 +216,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
        return get_sampler()
-    def _validate_and_reshape_mm_tensor(self,
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
-                                        mm_input: Union[torch.Tensor,
-                                                        List[torch.Tensor]],
                                        name: str) -> torch.Tensor:
        if not isinstance(mm_input, (torch.Tensor, list)):
            raise ValueError(f"Incorrect type of {name}. "

--- a/vllm/model_executor/models/qwen2_cls.py
+++ b/vllm/model_executor/models/qwen2_cls.py
-# Adapted from
-# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
-# Copyright 2024 Kakao Corp. (Kanana-X Team)
-# Copyright 2024 The Qwen team.
-# Copyright 2023 The vLLM team.
-"""Inference-only Qwen2-Classification model compatible with HF weights."""
-from typing import Iterable, List, Optional, Set, Tuple
-import torch
-from torch import nn
-from vllm.attention import AttentionMetadata
-from vllm.config import VllmConfig
-from vllm.model_executor.layers.linear import RowParallelLinear
-from vllm.model_executor.layers.pooler import Pooler, PoolingType
-from vllm.model_executor.models.qwen2 import Qwen2Model
-from vllm.model_executor.pooling_metadata import PoolingMetadata
-from vllm.sequence import IntermediateTensors, PoolerOutput
-from .interfaces import SupportsLoRA, SupportsPP
-from .utils import AutoWeightsLoader, maybe_prefix
-class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP):
-    packed_modules_mapping = {
-        "qkv_proj": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
-    # LoRA specific attributes
-    supported_lora_modules = [
-        "qkv_proj",
-        "o_proj",
-        "gate_up_proj",
-        "down_proj",
-    ]
-    embedding_modules = {}
-    embedding_padding_modules = []
-    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        super().__init__()
-        config = vllm_config.model_config.hf_config
-        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        pooler_config = vllm_config.model_config.pooler_config
-        self.config = config
-        self.lora_config = lora_config
-        self.quant_config = quant_config
-        self.model = Qwen2Model(vllm_config=vllm_config,
-                                prefix=maybe_prefix(prefix, "model"))
-        # hidden_states from Qwen2Model has been reduced,
-        # the input of score layer is not parallelized.
-        self.score = RowParallelLinear(config.hidden_size,
-                                       config.num_labels,
-                                       quant_config=quant_config,
-                                       input_is_parallel=False,
-                                       bias=False,
-                                       prefix=maybe_prefix(prefix, "score"))
-        self._pooler = Pooler.from_config_with_defaults(
-            pooler_config,
-            pooling_type=PoolingType.LAST,
-            normalize=False,
-            softmax=True)
-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors,
-                                   inputs_embeds)
-        logits, _ = self.score(hidden_states)
-        return logits
-    def pooler(
-        self,
-        hidden_states: torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> Optional[PoolerOutput]:
-        return self._pooler(hidden_states, pooling_metadata)
-    def load_weights(self, weights: Iterable[Tuple[str,
-                                                   torch.Tensor]]) -> Set[str]:
-        loader = AutoWeightsLoader(self,
-                                   ignore_unexpected_prefixes=["lm_head."])
-        return loader.load_weights(weights)
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -22,28 +22,26 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 from functools import cached_property, partial
-from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
+from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
-                    Optional, Set, Tuple, Type, TypedDict, Union)
+                    Tuple, Type, TypedDict, Union)
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
 from PIL import Image
-from transformers.image_utils import (get_image_size,
+from transformers import BatchFeature
-                                      infer_channel_dimension_format,
+from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
-                                      to_numpy_array)
+                                          Qwen2VLProcessor)
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
    Qwen2VLConfig, Qwen2VLVisionConfig)
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
-    make_batched_images, make_batched_videos, smart_resize)
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+from vllm.inputs import InputContext
-                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU
@@ -56,14 +54,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import cached_get_image_processor
+from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors
-from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict,
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                    MultiModalKwargs, NestedTensors)
+                                        MultiModalDataItems, ProcessorInputs,
-from vllm.multimodal.utils import cached_get_tokenizer
+                                        PromptReplacement)
 from vllm.platforms import _Backend
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
-from vllm.transformers_utils.processor import cached_get_processor
+from vllm.utils import is_list_of
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend,
@@ -164,7 +162,7 @@ class Qwen2VisionMLP(nn.Module):
    def __init__(
        self,
        in_features: int,
-        hidden_features: int = None,
+        hidden_features: int,
        act_layer: Type[nn.Module] = QuickGELU,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
@@ -693,78 +691,8 @@ class Qwen2VisionTransformer(nn.Module):
 # === Vision input helpers === #
-def get_mm_processor_kwargs(
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None) -> Dict[str, int]:
-    mm_processor_kwargs = {}
-    if min_pixels:
-        mm_processor_kwargs["min_pixels"] = min_pixels
-    if max_pixels:
-        mm_processor_kwargs["max_pixels"] = max_pixels
-    return mm_processor_kwargs
-def mm_input_mapper_for_qwen2_vl(
-    ctx: InputContext,
-    data: MultiModalData[object],
-    data_type_key: str,
-    *,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-) -> MultiModalKwargs:
-    """Input mapper for Qwen2-VL."""
-    if data_type_key == "image" and isinstance(data, dict):
-        return MultiModalKwargs({
-            "image_embeds": data.get("image_embeds"),
-            "image_grid_thw": data.get("image_grid_thw"),
-        })
-    if data_type_key == "video" and isinstance(data, dict):
-        return MultiModalKwargs({
-            "video_embeds": data.get("video_embeds"),
-            "video_grid_thw": data.get("video_grid_thw"),
-        })
-    model_config = ctx.model_config
-    # Handle mm processor kwargs; we pass these at creation time
-    # because preprocess() in transformers doesn't expose them
-    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-    image_processor = cached_get_image_processor(
-        model_config.model,
-        trust_remote_code=model_config.trust_remote_code,
-        **mm_processor_kwargs,
-    )
-    if image_processor is None:
-        raise RuntimeError("No HuggingFace processor is available "
-                           "to process the image object")
-    images = None
-    videos = None
-    if data_type_key == "image":
-        images = data
-    else:
-        assert data_type_key == "video"
-        videos = data
-    try:
-        batch_data = image_processor \
-            .preprocess(images=images, videos=videos, return_tensors="pt") \
-            .data
-    except Exception:
-        logger.error("Failed to process image (%s)", data)
-        raise
-    return MultiModalKwargs(batch_data)
-image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
-                                          data_type_key="image")
-video_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
-                                          data_type_key="video")
 def _get_vision_info(
-    image_processor,
+    vision_config: Qwen2VLVisionConfig,
    height: int,
    width: int,
    min_pixels: int,
@@ -775,12 +703,15 @@ def _get_vision_info(
 ):
    """Get information (resized height / width and number of vision tokens)
    of input image / video frame."""
+    patch_size = vision_config.patch_size
+    merge_size = vision_config.spatial_merge_size
+    temporal_patch_size = vision_config.temporal_patch_size
    if do_resize:
        resized_height, resized_width = smart_resize(
            height=height,
            width=width,
-            factor=image_processor.patch_size * image_processor.merge_size,
+            factor=patch_size * merge_size,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )
@@ -791,54 +722,41 @@ def _get_vision_info(
        grid_t = mm_count
    else:
        assert data_type_key == "video"
-        grid_t = max(mm_count // image_processor.temporal_patch_size, 1)
+        grid_t = max(mm_count // temporal_patch_size, 1)
-    grid_h = resized_height // image_processor.patch_size
+    grid_h = resized_height // patch_size
-    grid_w = resized_width // image_processor.patch_size
+    grid_w = resized_width // patch_size
    vision_tokens = grid_t * grid_h * grid_w
-    llm_num_vision_tokens = (vision_tokens // image_processor.merge_size //
+    llm_num_vision_tokens = vision_tokens // (merge_size**2)
-                             image_processor.merge_size)
    return resized_height, resized_width, llm_num_vision_tokens
-def _get_max_image_info(
+def _get_image_processor(hf_processor: Qwen2VLProcessor):
-    image_processor,
+    image_processor = hf_processor.image_processor  # type: ignore
-    data_type_key: str = "image",
+    assert isinstance(image_processor, Qwen2VLImageProcessor)
-    mm_count: int = 1,
+    return image_processor
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-):
-    # Limit min / max pixels unless they're explicitly provided
-    if min_pixels is None:
-        min_pixels = max(image_processor.min_pixels, 28 * 28)
-    if max_pixels is None:
-        max_pixels = min(image_processor.max_pixels, 1280 * 28 * 28)
-    return _get_vision_info(
-        image_processor,
-        height=9999999,
-        width=9999999,
-        min_pixels=min_pixels,
-        max_pixels=max_pixels,
-        data_type_key=data_type_key,
-        mm_count=mm_count,
-    )
 def get_max_qwen2_vl_mm_tokens(ctx: InputContext,
                               data_type_key: str,
                               *,
-                               min_pixels=None,
+                               min_pixels: Optional[int] = None,
-                               max_pixels=None) -> int:
+                               max_pixels: Optional[int] = None) -> int:
-    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
+    hf_config = ctx.get_hf_config(Qwen2VLConfig)
-                                                  max_pixels=max_pixels)
+    vision_config = hf_config.vision_config
-    image_processor = cached_get_image_processor(ctx.model_config.model,
-                                                 **mm_processor_kwargs)
+    hf_processor = ctx.get_hf_processor(Qwen2VLProcessor)
-    max_resized_height, max_resized_width, max_llm_image_tokens = \
+    image_processor = _get_image_processor(hf_processor)
-        _get_max_image_info(image_processor, data_type_key=data_type_key,
-                            mm_count=1, min_pixels=min_pixels,
+    _, _, max_llm_image_tokens = _get_vision_info(
-                            max_pixels=max_pixels)
+        vision_config,
+        height=9999999,
+        width=9999999,
+        min_pixels=min_pixels or image_processor.min_pixels,
+        max_pixels=max_pixels or image_processor.max_pixels,
+        data_type_key=data_type_key,
+    )
    return max_llm_image_tokens
@@ -848,290 +766,166 @@ get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens,
                                        data_type_key="video")
-def dummy_data_for_qwen2_vl(
+class Qwen2VLMultiModalDataItems(MultiModalDataItems):
-    ctx: InputContext,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-    *,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None
-) -> Tuple[SequenceData, Optional[MultiModalDataDict]]:
-    mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-    image_processor = cached_get_image_processor(ctx.model_config.model,
-                                                 **mm_processor_kwargs)
-    num_images = mm_counts["image"]
-    max_resized_height, max_resized_width, max_llm_image_tokens = \
-        _get_max_image_info(image_processor, data_type_key="image",
-                            mm_count=num_images, min_pixels=min_pixels,
-                            max_pixels=max_pixels)
-    if seq_len - max_llm_image_tokens - 2 < 0:
-        raise RuntimeError(
-            f"Qwen2-VL cannot process {num_images} images in a prompt, "
-            "please increase max_model_len or reduce image limit by "
-            "--limit-mm-per-prompt.")
-    # Check video counts.
-    num_videos = mm_counts["video"]
-    max_resized_height, max_resized_width, max_llm_video_tokens = \
-        _get_max_image_info(image_processor, data_type_key="video",
-                            mm_count=num_videos, min_pixels=min_pixels,
-                            max_pixels=max_pixels)
-    if seq_len - max_llm_video_tokens - 2 < 0:
-        raise RuntimeError(
-            f"Qwen2-VL cannot process {num_videos} videos in a prompt, "
-            "please increase max_model_len or reduce video limit by "
-            "--limit-mm-per-prompt.")
-    hf_config = ctx.get_hf_config(Qwen2VLConfig)
-    dummy_seqdata = SequenceData.from_prompt_token_counts(
-        (hf_config.vision_start_token_id, 1),
-        (hf_config.image_token_id, max_llm_image_tokens),
-        (hf_config.vision_end_token_id, 1),
-        (0, seq_len - max_llm_image_tokens - 2),
-    )
-    dummy_image = Image.new("RGB", (max_resized_width, max_resized_height),
-                            color=0)
-    return DummyData(dummy_seqdata, {
-        "image":
-        dummy_image if num_images == 1 else [dummy_image] * num_images
-    })
+    @staticmethod
+    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+        """
+        multi_data = Qwen2VLMultiModalDataItems()
+        for k, v in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            # yapf: disable
+            if k == "video":
+                # Special case since even a single item can be a list
+                multi_data[k] = (  # type: ignore[index]
+                    v if (isinstance(v, (dict, torch.Tensor))  # type: ignore[assignment]
+                          or is_list_of(v, list)) else [v]
+                )
+            elif k in ("image", "audio"):
+                multi_data[k] = (  # type: ignore[index]
+                    v if isinstance(v, (dict, torch.Tensor, list)) else [v]
+                )
+            else:
+                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+            # yapf: enable
-def _get_llm_num_vision_tokens(
+        return multi_data
-    mm_inputs: list,
-    data_type_key: str,
-    image_processor,
-    min_pixels: int,
-    max_pixels: int,
-):
-    """Get number of vision tokens of multimodal inputs.
-    This method is derived from `transformers.models.qwen2_vl.
+    def get_item_counts(self) -> Mapping[str, int]:
-    image_processing_qwen2_vl.Qwen2VLImageProcessor._preprocess`.
+        return {
-    """
+            m: (
-    image = to_numpy_array(mm_inputs[0])
+                len(items[f"{m}_grid_thw"])  # type: ignore
-    input_data_format = infer_channel_dimension_format(image)
+                if isinstance(items, dict) else len(items))
-    height, width = get_image_size(image, channel_dim=input_data_format)
+            for m, items in self.items()
+        }
-    _, _, llm_num_vision_tokens = _get_vision_info(
-        image_processor,
-        height=height,
-        width=width,
-        min_pixels=min_pixels,
-        max_pixels=max_pixels,
-        do_resize=image_processor.do_resize,
-        data_type_key=data_type_key,
-        mm_count=len(mm_inputs),
-    )
-    return llm_num_vision_tokens
-def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
+class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor):
-                       data_type_key: str, image_processor: Any,
-                       prompt_token_ids: List[int], min_pixels: Optional[int],
-                       max_pixels: Optional[int]) -> List[int]:
-    """
-    Expand pad tokens for multi-modal inputs (e.g., images or videos).
-    Args:
-        inputs (list): The multi-modal inputs (e.g., images or videos).
-        token_id (int): The token ID used to represent the multi-modal input.
-        make_batched_fn (Callable): A function to batch the inputs.
-        data_type_key (str): The type of the multi-modal input.
-        image_processor (Any): The image processor used to process the inputs.
-        prompt_token_ids (List[int]): The list of token IDs in the prompt.
-        min_pixels (int): min pixels to used for img processing
-        max_pixels (int): max pixels to be used for img processing
-    Returns:
-        List[int]: The list of token IDs for the multi-modal inputs.
-    """
-    indices = [
-        idx for idx, token in enumerate(prompt_token_ids) if token == token_id
-    ]
-    inputs = make_batched_fn(inputs)
-    assert len(indices) == len(inputs)
-    prompt_token_ids_with_data = []
-    for cnt, data in enumerate(inputs):
-        num_tokens = _get_llm_num_vision_tokens(
-            [data] if data_type_key == "image" else data,
-            data_type_key=data_type_key,
-            image_processor=image_processor,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-        if cnt == 0:
-            end_idx = indices[cnt]
-            non_data_tokens = prompt_token_ids[:end_idx]
-        else:
-            non_data_tokens = prompt_token_ids[indices[cnt - 1] +
-                                               1:indices[cnt]]
-        prompt_token_ids_with_data.extend(non_data_tokens)
-        prompt_token_ids_with_data.extend(token_id for _ in range(num_tokens))
-    prompt_token_ids_with_data.extend(prompt_token_ids[indices[-1] + 1:])
-    return prompt_token_ids_with_data
-def input_processor_for_qwen2_vl(
-    ctx: InputContext,
-    inputs: DecoderOnlyInputs,
-    *,
-    min_pixels: Optional[int] = None,
-    max_pixels: Optional[int] = None,
-) -> DecoderOnlyInputs:
-    multi_modal_data = inputs.get("multi_modal_data")
-    if multi_modal_data is None:
-        return inputs
-    image_inputs = multi_modal_data.get("image", None)
-    video_inputs = multi_modal_data.get("video", None)
-    processor = cached_get_processor(ctx.model_config.model)
-    image_processor = processor.image_processor
-    # Apply processor kwarg overrides for image processor options
-    min_pixels = min_pixels if min_pixels else image_processor.min_pixels
-    max_pixels = max_pixels if max_pixels else image_processor.max_pixels
-    model_config = ctx.model_config
-    hf_config = ctx.get_hf_config(Qwen2VLConfig)
-    # To avoid redundant processing of vision objects (resize, rescale, etc.),
+    def _get_mm_items(
-    # we extract code of calculating number of vision tokens from
+        self,
-    # `transformers.models.qwen2_vl.processing_qwen2_vl.Qwen2VLProcessor`.
+        mm_data: MultiModalDataDict,
-    #
+    ) -> MultiModalDataItems:
-    # The following code is equivalent to:
+        return Qwen2VLMultiModalDataItems.from_dict(mm_data)
-    #    prompt = inputs["prompt"]
-    #    inputs = processor(text=[prompt],
-    #                       images=image_inputs,
-    #                       videos=video_inputs,
-    #                       padding=True,
-    #                       return_tensors="pt")
-    #    prompt_token_ids = inputs["input_ids"][0].tolist()
-    tokenizer = cached_get_tokenizer(
-        model_config.tokenizer,
-        trust_remote_code=model_config.trust_remote_code)
-    prompt_token_ids = inputs["prompt_token_ids"]
-    # Expand image pad tokens.
-    if image_inputs is not None:
-        if isinstance(image_inputs, dict):
-            prompt_token_ids_with_image = []
-            image_indices = [
-                idx for idx, token in enumerate(prompt_token_ids)
-                if token == hf_config.image_token_id
-            ]
-            # ensure all image tokens have grid_thw
+    def _get_hf_processor(
-            assert \
+        self,
-                len(image_indices) == image_inputs["image_grid_thw"].size(0), \
+        *,
-                "image token num does not match image_grid_thw.shape"
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
-            image_counter = 0
+    ) -> Qwen2VLProcessor:
-            pad_token_counter = 0
+        hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
-            for idx, token in enumerate(prompt_token_ids):
+        image_processor = _get_image_processor(hf_processor)
-                if idx in image_indices:
-                    grid_thw = image_inputs["image_grid_thw"][image_counter]
+        if min_pixels:
-                    grid_t, grid_h, grid_w = grid_thw
+            image_processor.min_pixels = min_pixels
-                    num_pad_tokens = (grid_t * grid_h * grid_w //
+        if max_pixels:
-                                      image_processor.merge_size //
+            image_processor.max_pixels = max_pixels
-                                      image_processor.merge_size)
+        if max_pixels or min_pixels:
-                    prompt_token_ids_with_image.extend([token] *
+            image_processor.size = {
-                                                       num_pad_tokens)
+                "min_pixels": image_processor.min_pixels,
-                    image_counter += 1
+                "max_pixels": image_processor.max_pixels,
-                    pad_token_counter += num_pad_tokens
+            }
+        return hf_processor
+    def _get_processor_data(
+        self,
+        mm_items: MultiModalDataItems,
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
+        processor_data = dict[str, Any]()
+        passthrough_data = dict[str, Any]()
+        for k, v in mm_items.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            if k in ("image", "video", "audio"):
+                if isinstance(v, dict):
+                    # Pass through embedding inputs (dict)
+                    passthrough_data.update(v)
+                elif isinstance(v, torch.Tensor) and v.ndim == 3:
+                    # Pass through embedding inputs (single)
+                    passthrough_data[f"{k}_embeds"] = [v]
+                elif (is_list_of(v, torch.Tensor) and len(v) > 0
+                      and v[0].ndim == 2):
+                    # Pass through embedding inputs (multi)
+                    passthrough_data[f"{k}_embeds"] = v
                else:
-                    prompt_token_ids_with_image.append(token)
+                    # Map keys to plural form, e.g.: image -> images
+                    processor_data[f"{k}s"] = v
+            else:
+                processor_data[k] = v
-            # ensure all embeddings are used
+        return processor_data, passthrough_data
-            assert \
-                pad_token_counter == image_inputs["image_embeds"].size(0), \
-                "image_embeds.shape does not match image_grid_thw"
-            prompt_token_ids = prompt_token_ids_with_image
+    def _get_prompt_replacements(
-        else:
+        self,
-            prompt_token_ids = _expand_pad_tokens(image_inputs,
+        mm_items: MultiModalDataItems,
-                                                  hf_config.image_token_id,
+        hf_inputs: BatchFeature,
-                                                  make_batched_images,
+        mm_processor_kwargs: Mapping[str, object],
-                                                  "image",
+    ) -> list[PromptReplacement]:
-                                                  image_processor,
+        hf_processor = self._get_hf_processor()
-                                                  prompt_token_ids,
+        image_processor = _get_image_processor(hf_processor)
-                                                  min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
+        # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
+        # image_token and video_token registered
-    if video_inputs is not None:
+        placeholder = {
-        if isinstance(video_inputs, dict):
+            "image": hf_processor.image_token,
-            prompt_token_ids_with_video = []
+            "video": hf_processor.video_token,
-            video_indices = [
+        }
-                idx for idx, token in enumerate(prompt_token_ids)
+        merge_length = image_processor.merge_size**2
-                if token == hf_config.video_token_id
-            ]
+        def get_replacement_qwen2vl(item_idx: int, modality: str):
+            grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx]
+            num_tokens = grid_thw.prod() // merge_length
+            return placeholder[modality] * num_tokens
+        return [
+            PromptReplacement(
+                modality=modality,
+                target=placeholder[modality],
+                replacement=partial(get_replacement_qwen2vl,
+                                    modality=modality),
+            ) for modality in ("image", "video")
+        ]
-            # ensure all video tokens have grid_thw
+    def _get_dummy_mm_inputs(
-            assert \
+        self,
-                len(video_indices) == video_inputs["video_grid_thw"].size(0), \
+        mm_counts: Mapping[str, int],
-                "video token num does not match video_grid_thw.shape"
+    ) -> ProcessorInputs:
+        num_images = mm_counts["image"]
-            video_counter = 0
+        hf_processor = self._get_hf_processor()
-            pad_token_counter = 0
+        image_token: str = hf_processor.image_token
-            for idx, token in enumerate(prompt_token_ids):
+        image_processor = _get_image_processor(hf_processor)
-                if idx in video_indices:
-                    grid_thw = video_inputs["video_grid_thw"][video_counter]
+        data = {}
-                    grid_t, grid_h, grid_w = grid_thw
+        resized_height, resized_width = smart_resize(
-                    num_pad_tokens = (grid_t * grid_h * grid_w //
+            height=9999999,
-                                      image_processor.merge_size //
+            width=9999999,
-                                      image_processor.merge_size)
+            factor=image_processor.patch_size * image_processor.merge_size,
-                    prompt_token_ids_with_video.extend([token] *
+            min_pixels=image_processor.min_pixels,
-                                                       num_pad_tokens)
+            max_pixels=image_processor.max_pixels,
-                    video_counter += 1
+        )
-                    pad_token_counter += num_pad_tokens
-                else:
-                    prompt_token_ids_with_video.append(token)
-            # ensure all embeddings are used
+        dummy_image = Image.new("RGB", (resized_width, resized_height),
-            assert \
+                                color=0)
-                pad_token_counter == video_inputs["video_embeds"].size(0), \
+        data["image"] = [dummy_image] * num_images
-                "video_embeds.shape does not match video_grid_thw"
-            prompt_token_ids = prompt_token_ids_with_video
+        return ProcessorInputs(
-        else:
+            prompt_text=image_token * num_images,
-            prompt_token_ids = _expand_pad_tokens(video_inputs,
+            mm_data=data,
-                                                  hf_config.video_token_id,
+            mm_processor_kwargs={},
-                                                  make_batched_videos,
+        )
-                                                  "video",
-                                                  image_processor,
-                                                  prompt_token_ids,
-                                                  min_pixels=min_pixels,
-                                                  max_pixels=max_pixels)
-    prompt = inputs.get("prompt")
-    if prompt is None:
-        prompt = tokenizer.decode(prompt_token_ids)
-    return token_inputs(
-        prompt_token_ids=prompt_token_ids,
-        prompt=prompt,
-        multi_modal_data=multi_modal_data,
-    )
-@MULTIMODAL_REGISTRY.register_image_input_mapper(
-    image_input_mapper_for_qwen2_vl)
-@MULTIMODAL_REGISTRY.register_input_mapper("video",
-                                           video_input_mapper_for_qwen2_vl)
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens)
 @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
    "video", get_max_qwen2_vl_video_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl)
+@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                                      SupportsLoRA, SupportsPP):
    packed_modules_mapping = {
@@ -1156,10 +950,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
    ]
    embedding_modules = {}
    embedding_padding_modules = []
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "lm_head.": "language_model.lm_head.",
+        "model.": "language_model.model.",
+    })
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
-        config = vllm_config.model_config.hf_config
+        config: Qwen2VLConfig = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
@@ -1456,11 +1255,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "lm_head.": "language_model.lm_head.",
-                "model.": "language_model.model.",
-            })
        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -20,11 +20,10 @@ import torch.nn as nn
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from .adapters import as_embedding_model
 from .interfaces import (has_inner_state, is_attention_free, is_hybrid,
                         supports_cross_encoding, supports_multimodal,
                         supports_pp)
-from .interfaces_base import is_pooling_model, is_text_generation_model
+from .interfaces_base import is_text_generation_model
 logger = init_logger(__name__)
@@ -46,6 +45,7 @@ _TEXT_GENERATION_MODELS = {
    "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
    "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
+    "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"),
    "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"),
    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
@@ -113,6 +113,7 @@ _EMBEDDING_MODELS = {
    "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
    "GlmForCausalLM": ("glm", "GlmForCausalLM"),
    "GritLM": ("gritlm", "GritLM"),
+    "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),  # noqa: E501
    "LlamaModel": ("llama", "LlamaForCausalLM"),
    **{
        # Multiple models share the same architecture, so we include them all
@@ -124,12 +125,13 @@ _EMBEDDING_MODELS = {
    "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"),
    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
    "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"),
-    "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"),  # noqa: E501
    "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
    # [Multimodal]
    "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"),  # noqa: E501
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    # [Auto-converted (see adapters.py)]
+    "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"),
 }
 _CROSS_ENCODER_MODELS = {
@@ -225,19 +227,10 @@ class _ModelInfo:
    @staticmethod
    def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo":
-        is_pooling_model_ = is_pooling_model(model)
-        if not is_pooling_model_:
-            try:
-                as_embedding_model(model)
-            except Exception:
-                pass
-            else:
-                is_pooling_model_ = True
        return _ModelInfo(
            architecture=model.__name__,
            is_text_generation_model=is_text_generation_model(model),
-            is_pooling_model=is_pooling_model_,
+            is_pooling_model=True,  # Can convert any model into a pooling model
            supports_cross_encoding=supports_cross_encoding(model),
            supports_multimodal=supports_multimodal(model),
            supports_pp=supports_pp(model),

--- a/vllm/model_executor/models/telechat2.py
+++ b/vllm/model_executor/models/telechat2.py
@@ -31,6 +31,19 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
 class TeleChat2Model(LlamaModel):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "transformer.": "model.",
+        },
+        orig_to_new_substr={
+            ".h.": ".layers.",
+            ".self_attention.": ".self_attn.",
+            ".word_embeddings.": ".embed_tokens.",
+            ".dense.": ".o_proj.",
+            ".ln_f.": ".norm.",
+        },
+    )
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # 1. Initialize the LlamaModel with bias
        vllm_config.model_config.hf_config.bias = True
@@ -111,21 +124,9 @@ class TeleChat2ForCausalLM(LlamaForCausalLM):
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={
-                "transformer.": "model.",
-            },
-            orig_to_new_substr={
-                ".h.": ".layers.",
-                ".self_attention.": ".self_attn.",
-                ".word_embeddings.": ".embed_tokens.",
-                ".dense.": ".o_proj.",
-                ".ln_f.": ".norm.",
-            },
-        )
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -3,7 +3,7 @@
 import math
 from functools import cached_property, lru_cache
-from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set,
+from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
                    Tuple, TypedDict, Union)
 import numpy as np
@@ -11,7 +11,7 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import functional as F
-from transformers import BatchFeature
+from transformers import BatchFeature, ProcessorMixin
 from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
@@ -25,11 +25,11 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataDict,
                                        MultiModalDataItems, ProcessorInputs,
                                        PromptReplacement)
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+from vllm.utils import is_list_of
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
@@ -61,8 +61,8 @@ def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor:
 def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor:
-    return cached_feature_extractor(
+    hf_config = ctx.get_hf_config(UltravoxConfig)
-        ctx.get_hf_config(UltravoxConfig).audio_model_id)
+    return cached_feature_extractor(hf_config.audio_model_id)
 def get_ultravox_max_audio_tokens(ctx: InputContext):
@@ -73,72 +73,71 @@ def get_ultravox_max_audio_tokens(ctx: InputContext):
 class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
    def _get_feature_extractor(self) -> WhisperFeatureExtractor:
-        return self._get_hf_processor().audio_processor.feature_extractor
+        hf_processor = self._get_hf_processor()
+        return hf_processor.audio_processor.feature_extractor  # type: ignore
-    def _resample_audio(
+    def _get_processor_data(
        self,
-        audio: np.ndarray,
+        mm_items: MultiModalDataItems,
-        sr: int,
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
-    ) -> Dict[str, Union[np.ndarray, int]]:
        # resample audio to the model's sampling rate
        feature_extractor = self._get_feature_extractor()
-        if sr != feature_extractor.sampling_rate:
+        mm_items.resample_audios(feature_extractor.sampling_rate)
-            try:
-                import librosa
+        return super()._get_processor_data(mm_items)
-            except ImportError as exc:
-                raise ImportError(
+    def _call_hf_processor(
-                    "Please install vllm[audio] for audio support.") from exc
-            audio = librosa.resample(audio,
-                                     orig_sr=sr,
-                                     target_sr=feature_extractor.sampling_rate)
-            sr = feature_extractor.sampling_rate
-        return {"audio": audio, "sampling_rate": sr}
-    def _apply_hf_processor(
        self,
+        hf_processor: ProcessorMixin,
        prompt: str,
-        mm_data: MultiModalDataDict,
+        processor_data: Mapping[str, object],
        mm_processor_kwargs: Mapping[str, object],
    ) -> BatchFeature:
-        if not mm_data or not mm_data.get("audio", None):
+        processor_data = dict(processor_data)
-            return super()._apply_hf_processor(prompt, mm_data,
+        audios = processor_data.pop("audios", [])
-                                               mm_processor_kwargs)
+        if not audios:
+            return super()._call_hf_processor(
+                hf_processor,
+                prompt=prompt,
+                processor_data=processor_data,
+                mm_processor_kwargs=mm_processor_kwargs,
+            )
+        feature_extractor = self._get_feature_extractor()
+        mm_processor_kwargs = dict(
+            **mm_processor_kwargs,
+            sampling_rate=feature_extractor.sampling_rate,
+        )
-        audio_data = mm_data["audio"]
+        # Already resampled by _get_processor_data
-        if not isinstance(audio_data, list):
+        assert is_list_of(audios, np.ndarray)
-            audio_data = [audio_data]
        # Ultravox processor doesn't support multiple inputs,
        # therefore we need to input text and audio one by one
-        tokenizer = self._get_tokenizer()
        audio_features, audio_token_len = [], []
-        processed_inputs = {}
+        shared_outputs = {}
-        for audio, sr in audio_data:
+        for audio in audios:
-            data = self._resample_audio(audio, sr)
+            # NOTE: Ultravox processor accepts "audio" instead of "audios"
-            processed_inputs = super()._apply_hf_processor(
+            item_processor_data = dict(**processor_data, audio=audio)
-                prompt, data, mm_processor_kwargs)
-            prompt = tokenizer.decode(processed_inputs["input_ids"][0],
+            item_outputs = super()._call_hf_processor(
-                                      skip_special_tokens=False)
+                hf_processor,
-            audio_features.append(
+                prompt=prompt,
-                processed_inputs.pop("audio_values").squeeze(0))
+                processor_data=item_processor_data,
-            audio_token_len.append(
+                mm_processor_kwargs=mm_processor_kwargs,
-                processed_inputs.pop("audio_token_len").item())
+            )
-        return dict(
+            audio_features.append(item_outputs.pop("audio_values")[0])
-            **processed_inputs,
+            audio_token_len.append(item_outputs.pop("audio_token_len").item())
+            shared_outputs = item_outputs
+        combined_outputs = dict(
+            **shared_outputs,
            audio_features=audio_features,
            audio_token_len=audio_token_len,
        )
+        return BatchFeature(combined_outputs)
-    def _get_processor_data(
-        self,
-        mm_data: MultiModalDataDict,
-    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        # Ultravox uses "audio" instead of "audios" as calling keyword
-        processor_data, passthrough_data = super()._get_processor_data(mm_data)
-        if "audios" in processor_data:
-            processor_data["audio"] = processor_data.pop("audios")
-        return processor_data, passthrough_data
    def _get_prompt_replacements(
        self,
@@ -147,7 +146,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
        mm_processor_kwargs: Mapping[str, object],
    ) -> list[PromptReplacement]:
        hf_processor = self._get_hf_processor()
-        placeholder = hf_processor.audio_token_replacement
+        placeholder = hf_processor.audio_token_replacement  # type: ignore
        def get_replacement_ultravox(item_idx: int):
            audio_token_len = hf_inputs["audio_token_len"][item_idx]
@@ -171,7 +170,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor):
        audio_count = mm_counts["audio"]
        audio = np.zeros(audio_len)
-        data = {"audio": [(audio, sampling_rate)] * audio_count}
+        data = {"audio": [audio] * audio_count}
        return ProcessorInputs(
            prompt_text="<|audio|>" * audio_count,
@@ -303,6 +302,9 @@ class ModifiedWhisperEncoder(WhisperEncoder):
 @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor)
 class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
@@ -495,9 +497,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
-        hf_to_vllm_mapper = WeightsMapper(
-            orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
        loader = AutoWeightsLoader(self,
                                   ignore_unexpected_prefixes=["audio_tower."])
-        return loader.load_weights(weights, mapper=hf_to_vllm_mapper)
+        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
\ No newline at end of file
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -328,6 +328,15 @@ class PackedvLLMParameter(ModelWeightParameter):
            marlin_tile_size=self.marlin_tile_size)
+class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    block-wise quantization. Uses both column and row parallelism.
+    """
+    pass
 def permute_param_layout_(param: BasevLLMParameter, input_dim: int,
                          output_dim: int, **kwargs) -> BasevLLMParameter:
    """

--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -11,7 +11,7 @@ The global :class:`~MultiModalRegistry` is used by model runners to
 dispatch data processing according to its modality and the target model.
 See also:
-    :ref:`input_processing_pipeline`
+    :ref:`input-processing-pipeline`
 """
 __all__ = [

--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
+import numpy as np
+import numpy.typing as npt
 from vllm.inputs.registry import InputContext
+from vllm.utils import PlaceholderModule
 from .base import MultiModalPlugin
 from .inputs import AudioItem, MultiModalData, MultiModalKwargs
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 class AudioPlugin(MultiModalPlugin):
    """Plugin for audio data."""
@@ -21,3 +30,12 @@ class AudioPlugin(MultiModalPlugin):
    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        raise NotImplementedError(
            "There is no default maximum multimodal tokens")
+def resample_audio(
+    audio: npt.NDArray[np.floating],
+    *,
+    orig_sr: float,
+    target_sr: float,
+) -> npt.NDArray[np.floating]:
+    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -50,7 +50,7 @@ class MultiModalPlugin(ABC):
    (i.e., the modality of the data).
    See also:
-        :ref:`adding_multimodal_plugin`
+        :ref:`adding-multimodal-plugin`
    """
    def __init__(self) -> None:
@@ -94,8 +94,8 @@ class MultiModalPlugin(ABC):
        If `None` is provided, then the default input mapper is used instead.
        See also:
-            - :ref:`input_processing_pipeline`
+            - :ref:`input-processing-pipeline`
-            - :ref:`enabling_multimodal_inputs`
+            - :ref:`enabling-multimodal-inputs`
        """
        def wrapper(model_cls: N) -> N:
@@ -130,8 +130,8 @@ class MultiModalPlugin(ABC):
            TypeError: If the data type is not supported.
        See also:
-            - :ref:`input_processing_pipeline`
+            - :ref:`input-processing-pipeline`
-            - :ref:`enabling_multimodal_inputs`
+            - :ref:`enabling-multimodal-inputs`
        """
        # Avoid circular import
@@ -190,7 +190,7 @@ class MultiModalPlugin(ABC):
        If `None` is provided, then the default calculation is used instead.
        See also:
-            :ref:`enabling_multimodal_inputs`
+            :ref:`enabling-multimodal-inputs`
        """
        def wrapper(model_cls: N) -> N:
@@ -222,7 +222,7 @@ class MultiModalPlugin(ABC):
        The model is identified by ``model_config``.
        See also:
-            :ref:`enabling_multimodal_inputs`
+            :ref:`enabling-multimodal-inputs`
        """
        # Avoid circular import
        from vllm.model_executor.model_loader import get_model_architecture

--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        return 3000
+def rescale_image_size(image: Image.Image,
+                       size_factor: float,
+                       transpose: int = -1) -> Image.Image:
+    """Rescale the dimensions of an image by a constant factor."""
+    new_width = int(image.width * size_factor)
+    new_height = int(image.height * size_factor)
+    image = image.resize((new_width, new_height))
+    if transpose >= 0:
+        image = image.transpose(Image.Transpose(transpose))
+    return image
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -15,31 +15,32 @@ _T = TypeVar("_T")
 # yapf: disable
 ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor]
 """
-A :class:`transformers.image_utils.ImageInput` representing a single image,
+A :class:`transformers.image_utils.ImageInput` representing a single image
-which can be passed to a HuggingFace :code:`ImageProcessor`.
+item, which can be passed to a HuggingFace :code:`ImageProcessor`.
 """
 VideoItem: TypeAlias = Union[
-    List[Image],
+    list[Image],
    np.ndarray,
    torch.Tensor,
-    List[np.ndarray],
+    list[np.ndarray],
-    List[torch.Tensor],
+    list[torch.Tensor],
 ]
 """
+A :class:`transformers.image_utils.VideoInput` representing a single video
-A :class:`transformers.image_utils.VideoInput` representing a single video,
+item, which can be passed to a HuggingFace :code:`VideoProcessor`.
-which can be passed to a HuggingFace :code:`VideoProcessor`.
 """
 AudioItem: TypeAlias = Union[
    np.ndarray,
-    List[float],
+    list[float],
-    Tuple[np.ndarray, float],  # DEPRECATED: Use mm_processor_kwargs instead
+    # `(audio, sampling_rate)`: If the audio's sampling rate is different
+    # from that expected by the model, we need to resample it.
+    tuple[np.ndarray, float],
 ]
 """
-Represents a single audio that can be inputted to a HuggingFace
+Represents a single audio
-:code:`AudioProcessor`.
+item, which can be passed to a HuggingFace :code:`AudioProcessor`.
 """
 # yapf: enable
@@ -74,7 +75,7 @@ Note:
    This dictionary also accepts modality keys defined outside
    :class:`MultiModalDataBuiltins` as long as a customized plugin
    is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
-    Read more on that :ref:`here <adding_multimodal_plugin>`.
+    Read more on that :ref:`here <adding-multimodal-plugin>`.
 """
@@ -215,6 +216,9 @@ class MultiModalInputsV2(TypedDict):
    mm_kwargs: MultiModalKwargs
    """Keyword arguments to be directly passed to the model after batching."""
+    mm_hashes: NotRequired[List[str]]
+    """The hashes of the multi-modal data."""
    mm_placeholders: MultiModalPlaceholderDict
    """
    For each modality, information about the placeholder tokens in

--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -17,6 +17,7 @@ from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.utils import flatten_2d_lists, full_groupby, is_list_of
+from .audio import resample_audio
 from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
                     VideoItem)
@@ -30,7 +31,7 @@ _PromptSeq = Union[str, list[int]]
 @dataclass
 class PromptReplacement:
    modality: str
-    """The modality for which the replacement is made"""
+    """The modality for which the replacement is made."""
    target: _PromptSeq
    """The text or token sequence to find and replace."""
@@ -211,20 +212,54 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
    corresponds to a list.
    """
+    @staticmethod
+    def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems":
+        """
+        Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+        """
+        multi_data = MultiModalDataItems()
+        for k, v in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
+            # yapf: disable
+            if k == "video":
+                # Special case since even a single item can be a list
+                multi_data[k] = (  # type: ignore[index]
+                    v if (isinstance(v, torch.Tensor)
+                          or is_list_of(v, list)) else [v]
+                )
+            elif k in ("image", "audio"):
+                multi_data[k] = (  # type: ignore[index]
+                    v if isinstance(v, (torch.Tensor, list)) else [v]
+                )
+            else:
+                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+            # yapf: enable
+        return multi_data
+    # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to
+    # `self.images` doesn't update this dictionary, which may be confusing
+    # We annotate the getter methods as `Sequence` to prevent others from
+    # trying to update the list in this way
    @property
-    def image(self) -> list[ImageItem]:
+    def images(self) -> Sequence[ImageItem]:
-        return self["image"]
+        return self.get("image", [])
    @property
-    def video(self) -> list[VideoItem]:
+    def videos(self) -> Sequence[VideoItem]:
-        return self["video"]
+        return self.get("video", [])
    @property
-    def audio(self) -> list[AudioItem]:
+    def audios(self) -> Sequence[AudioItem]:
-        return self["audio"]
+        return self.get("audio", [])
+    def get_item_counts(self) -> Mapping[str, int]:
+        return {m: len(items) for m, items in self.items()}
    def get_image_size(self, item_idx: int) -> ImageSize:
-        image = self.image[item_idx]
+        image = self.images[item_idx]
        if isinstance(image, Image):
            return ImageSize(*image.size)
@@ -234,25 +269,41 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
        assert_never(image)
+    def get_audio_with_sr(
+        self,
+        item_idx: int,
+        *,
+        default_sr: float,
+    ) -> tuple[np.ndarray, float]:
+        audio = self.audios[item_idx]
+        if isinstance(audio, tuple):
+            return audio
+        if isinstance(audio, list):
+            return np.array(audio), default_sr
+        if isinstance(audio, np.ndarray):
+            return audio, default_sr
+        assert_never(audio)
+    def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None:
+        """
+        If :code:`drop_sr=True`, the audio items in this dictionary are updated
+        to be NumPy arrays which implicitly means that their sampling rate is
+        the same as the model's expected sampling rate; otherwise, they remain
+        as :code:`(audio, new_sr)` tuples.
+        """
+        if not self.audios:
+            return
-def to_multi_format(data: MultiModalDataDict) -> MultiModalDataItems:
+        new_audios = []
-    """
+        for item_idx in range(len(self.audios)):
-    Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`.
+            audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr)
-    """
+            audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr)
-    multi_data = MultiModalDataItems()
+            new_audios.append(audio if drop_sr else (audio, new_sr))
-    for k, v in data.items():
-        # yapf: disable
-        if k == "video":
-            # Special case since even a single item can be a list
-            multi_data[k] = v if is_list_of(v, list) else [v]  # type: ignore[index]
-        elif k in ("image", "audio"):
-            multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-        else:
-            multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
-        # yapf: enable
-    return multi_data
+        self["audio"] = new_audios
 class _TokenMatch(NamedTuple):
@@ -567,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
    def _get_tokenizer(self) -> AnyTokenizer:
        return self.ctx.tokenizer
+    def _get_mm_items(
+        self,
+        mm_data: MultiModalDataDict,
+    ) -> MultiModalDataItems:
+        return MultiModalDataItems.from_dict(mm_data)
    @abstractmethod
    def _get_prompt_replacements(
        self,
@@ -596,18 +653,20 @@ class BaseMultiModalProcessor(ABC):
    def _get_processor_data(
        self,
-        mm_data: MultiModalDataDict,
+        mm_items: MultiModalDataItems,
-    ) -> BatchFeature:
+    ) -> tuple[dict[str, Any], dict[str, Any]]:
        processor_data = dict[str, Any]()
        passthrough_data = dict[str, Any]()
-        for k, v in mm_data.items():
+        for k, v in mm_items.items():
            # TODO: Make a separate modality for embedding inputs
            # to avoid confusion
            if k in ("image", "video", "audio"):
                if isinstance(v, torch.Tensor) and v.ndim == 3:
                    # Pass through embedding inputs (single)
                    passthrough_data[f"{k}_embeds"] = [v]
-                elif is_list_of(v, torch.Tensor) and v[0].ndim == 2:
+                elif (is_list_of(v, torch.Tensor) and len(v) > 0
+                      and v[0].ndim == 2):
                    # Pass through embedding inputs (multi)
                    passthrough_data[f"{k}_embeds"] = v
                else:
@@ -615,40 +674,41 @@ class BaseMultiModalProcessor(ABC):
                    processor_data[f"{k}s"] = v
            else:
                processor_data[k] = v
        return processor_data, passthrough_data
+    def _call_hf_processor(
+        self,
+        hf_processor: ProcessorMixin,
+        prompt: str,
+        processor_data: Mapping[str, object],
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        return self.ctx.call_hf_processor(
+            hf_processor,
+            prompt,
+            processor_data,
+            mm_processor_kwargs,
+        )
    def _apply_hf_processor(
        self,
        prompt: str,
-        mm_data: MultiModalDataDict,
+        mm_items: MultiModalDataItems,
        mm_processor_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        # some mm_processor_kwargs may be used in processor initialization
        # instead of processor call
        hf_processor = self._get_hf_processor(**mm_processor_kwargs)
-        processor_data, passthrough_data = self._get_processor_data(mm_data)
+        processor_data, passthrough_data = self._get_processor_data(mm_items)
-        assert callable(hf_processor)
+        hf_inputs = self._call_hf_processor(
-        mm_processor_kwargs = self.ctx.resolve_hf_processor_call_kwargs(
            hf_processor,
-            mm_processor_kwargs,
+            prompt=prompt,
+            processor_data=processor_data,
+            mm_processor_kwargs=mm_processor_kwargs,
        )
-        try:
-            hf_inputs = hf_processor(
-                text=prompt,  # type: ignore
-                **processor_data,
-                **mm_processor_kwargs,
-                return_tensors="pt",
-            )
-        except Exception as exc:
-            data = dict(text=prompt, **processor_data)
-            raise RuntimeError(
-                f"Failed to apply {type(hf_processor).__name__} "
-                f"on data={data} with kwargs={mm_processor_kwargs}") from exc
        hf_inputs.update(passthrough_data)
        return hf_inputs
@@ -730,25 +790,25 @@ class BaseMultiModalProcessor(ABC):
        3. Extract information about the placeholder tokens from the
           processed token IDs.
        """
-        tokenizer = self._get_tokenizer()
+        mm_items = self._get_mm_items(mm_data)
-        hf_inputs = self._apply_hf_processor(prompt_text, mm_data,
+        hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
                                             mm_processor_kwargs)
        prompt_ids, = hf_inputs.pop("input_ids").tolist()
        mm_kwargs = MultiModalKwargs(hf_inputs)
-        mm_items = to_multi_format(mm_data)
        prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs,
                                                     mm_processor_kwargs)
        all_prompt_repls = self._bind_prompt_replacements(prompt_repls)
        # If HF processor already inserts placeholder tokens,
        # there is no need for us to insert them
-        mm_item_counts = {m: len(items) for m, items in mm_items.items()}
+        mm_item_counts = mm_items.get_item_counts()
        all_placeholders = self._find_placeholders(all_prompt_repls,
                                                   prompt_ids, mm_item_counts)
        if all_placeholders:
+            tokenizer = self._get_tokenizer()
            prompt_text = _decode(tokenizer, prompt_ids)
        else:
            (

--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -76,7 +76,7 @@ class MultiModalRegistry:
        Register a multi-modal plugin so it can be recognized by vLLM.
        See also:
-            :ref:`adding_multimodal_plugin`
+            :ref:`adding-multimodal-plugin`
        """
        data_type_key = plugin.get_data_key()
@@ -311,8 +311,8 @@ class MultiModalRegistry:
        invoked to transform the data into a dictionary of model inputs.
        See also:
-            - :ref:`input_processing_pipeline`
+            - :ref:`input-processing-pipeline`
-            - :ref:`enabling_multimodal_inputs`
+            - :ref:`enabling-multimodal-inputs`
        """
        def wrapper(model_cls: N) -> N:

--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,7 @@ import base64
 import os
 from functools import lru_cache
 from io import BytesIO
-from typing import Any, List, Optional, Tuple, TypeVar, Union
+from typing import List, Optional, Tuple, TypeVar, Union
 import numpy as np
 import numpy.typing as npt
@@ -14,9 +14,25 @@ import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.utils import PlaceholderModule
 from .inputs import MultiModalDataDict, PlaceholderRange
+try:
+    import decord
+except ImportError:
+    decord = PlaceholderModule("decord")  # type: ignore[assignment]
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+try:
+    import soundfile
+except ImportError:
+    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 logger = init_logger(__name__)
 cached_get_tokenizer = lru_cache(get_tokenizer)
@@ -138,19 +154,7 @@ async def async_fetch_image(image_url: str,
    return image.convert(image_mode)
-def _load_video_frames_from_bytes(b: bytes):
+def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
-    frame = Image.open(BytesIO(b))
-    return np.array(frame)
-def load_video_frames_from_base64(frame: Union[bytes, str]):
-    """Load frame from base64 format."""
-    return _load_video_frames_from_bytes(base64.b64decode(frame))
-def _load_video_from_bytes(b: bytes, num_frames: int = 32):
-    _, decord = try_import_video_packages()
    video_path = BytesIO(b)
    vr = decord.VideoReader(video_path, num_threads=1)
    total_frame_num = len(vr)
@@ -168,13 +172,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
    return frames
-def _load_video_from_data_url(video_url: str):
+def _load_video_from_data_url(video_url: str) -> npt.NDArray:
-    # Only split once and assume the second part is the base64 encoded image
+    # Only split once and assume the second part is the base64 encoded video
-    frames_base64 = video_url.split(",")[1:]
+    _, video_base64 = video_url.split(",", 1)
-    return np.stack([
-        load_video_frames_from_base64(frame_base64)
+    if video_url.startswith("data:video/jpeg;"):
-        for frame_base64 in frames_base64
+        return np.stack([
-    ])
+            np.array(load_image_from_base64(frame_base64))
+            for frame_base64 in video_base64.split(",")
+        ])
+    return load_video_from_base64(video_base64)
 def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
@@ -217,22 +225,10 @@ async def async_fetch_video(video_url: str,
    return video
-def try_import_audio_packages() -> Tuple[Any, Any]:
-    try:
-        import librosa
-        import soundfile
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[audio] for audio support.") from exc
-    return librosa, soundfile
 def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
    """
    Load audio from a URL.
    """
-    librosa, _ = try_import_audio_packages()
    if audio_url.startswith("http"):
        audio_bytes = global_http_connection.get_bytes(
            audio_url,
@@ -253,8 +249,6 @@ async def async_fetch_audio(
    """
    Asynchronously fetch audio from a URL.
    """
-    librosa, _ = try_import_audio_packages()
    if audio_url.startswith("http"):
        audio_bytes = await global_http_connection.async_get_bytes(
            audio_url,
@@ -313,8 +307,6 @@ def encode_audio_base64(
    sampling_rate: int,
 ) -> str:
    """Encode audio as base64."""
-    _, soundfile = try_import_audio_packages()
    buffered = BytesIO()
    soundfile.write(buffered, audio, sampling_rate, format="WAV")
@@ -343,61 +335,7 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
    return _load_image_from_bytes(base64.b64decode(image))
-def rescale_image_size(image: Image.Image,
+def encode_video_base64(frames: npt.NDArray) -> str:
-                       size_factor: float,
-                       transpose: int = -1) -> Image.Image:
-    """Rescale the dimensions of an image by a constant factor."""
-    new_width = int(image.width * size_factor)
-    new_height = int(image.height * size_factor)
-    image = image.resize((new_width, new_height))
-    if transpose >= 0:
-        image = image.transpose(Image.Transpose(transpose))
-    return image
-def try_import_video_packages() -> Any:
-    try:
-        import cv2
-        import decord
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[video] for video support.") from exc
-    return cv2, decord
-def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-    num_frames, _, _, channels = frames.shape
-    new_height, new_width = size
-    resized_frames = np.empty((num_frames, new_height, new_width, channels),
-                              dtype=frames.dtype)
-    for i, frame in enumerate(frames):
-        resized_frame = cv2.resize(frame, (new_width, new_height))
-        resized_frames[i] = resized_frame
-    return resized_frames
-def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
-    _, height, width, _ = frames.shape
-    new_height = int(height * size_factor)
-    new_width = int(width * size_factor)
-    return resize_video(frames, (new_height, new_width))
-def sample_frames_from_video(frames: npt.NDArray,
-                             num_frames: int) -> npt.NDArray:
-    total_frames = frames.shape[0]
-    if num_frames == -1:
-        return frames
-    else:
-        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
-        sampled_frames = frames[frame_indices, ...]
-        return sampled_frames
-def encode_video_base64(frames: npt.NDArray):
    base64_frames = []
    frames_list = [frames[i] for i in range(frames.shape[0])]
    for frame in frames_list:
@@ -406,6 +344,11 @@ def encode_video_base64(frames: npt.NDArray):
    return ",".join(base64_frames)
+def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
+    """Load video from base64 format."""
+    return _load_video_from_bytes(base64.b64decode(video))
 def resolve_visual_encoder_outputs(
    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
    feature_sample_layers: Optional[list[int]],

--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Dict, Optional
+import cv2
 import numpy as np
+import numpy.typing as npt
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
@@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin):
    def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
        return 4096
+def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
+    num_frames, _, _, channels = frames.shape
+    new_height, new_width = size
+    resized_frames = np.empty((num_frames, new_height, new_width, channels),
+                              dtype=frames.dtype)
+    for i, frame in enumerate(frames):
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        resized_frames[i] = resized_frame
+    return resized_frames
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    _, height, width, _ = frames.shape
+    new_height = int(height * size_factor)
+    new_width = int(width * size_factor)
+    return resize_video(frames, (new_height, new_width))
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    total_frames = frames.shape[0]
+    if num_frames == -1:
+        return frames
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+    sampled_frames = frames[frame_indices, ...]
+    return sampled_frames
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -355,7 +355,8 @@ class PoolingRequestOutput(Generic[_O]):
        pooled_data = seq_group.pooled_data
        assert pooled_data is not None
-        output = PoolingOutput(pooled_data)
+        data = pooled_data.to(dtype=torch.float32, device="cpu")
+        output = PoolingOutput(data)
        prompt_token_ids = seq_group.prompt_token_ids
        finished = seq_group.is_finished()

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -54,7 +54,7 @@ class CpuPlatform(Platform):
        import vllm.envs as envs
        from vllm.utils import GiB_bytes
        model_config = vllm_config.model_config
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
        # If the feature combo become valid
        if not model_config.enforce_eager:
            logger.warning(

--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -165,7 +165,7 @@ def main():
        required=False,
        help="Read CLI options from a config file."
        "Must be a YAML with the following options:"
-        "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server"
+        "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
    )
    serve_parser = make_arg_parser(serve_parser)
    serve_parser.set_defaults(dispatch_function=serve)

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -113,7 +113,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
    return spec_decode_worker
-# Reminder: Please update docs/source/usage/compatibility_matrix.rst
+# Reminder: Please update docs/source/usage/compatibility_matrix.md
 # If the feature combo become valid
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
    """Worker which implements speculative decoding.