Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -124,8 +124,8 @@ _TEXT_GENERATION_MODELS = {
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
    "GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
-    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),  # noqa: E501
-    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),
+    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),
    "GritLM": ("gritlm", "GritLM"),
    "Grok1ModelForCausalLM": ("grok1", "GrokForCausalLM"),
    "Grok1ForCausalLM": ("grok1", "GrokForCausalLM"),
@@ -143,7 +143,7 @@ _TEXT_GENERATION_MODELS = {
    "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
    "Jais2ForCausalLM": ("jais2", "Jais2ForCausalLM"),
    "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
-    "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"),  # noqa: E501
+    "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"),
    "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
    "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"),
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
@@ -249,17 +249,14 @@ _EMBEDDING_MODELS = {
    # [Multimodal]
    "CLIPModel": ("clip", "CLIPEmbeddingModel"),
    "ColPaliForRetrieval": ("colpali", "ColPaliModel"),
+    "LlamaNemotronVLModel": ("nemotron_vl", "LlamaNemotronVLForEmbedding"),
    "LlavaNextForConditionalGeneration": (
        "llava_next",
        "LlavaNextForConditionalGeneration",
    ),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
-    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
    "SiglipModel": ("siglip", "SiglipEmbeddingModel"),
-    "LlamaNemotronVLModel": (
-        "nemotron_vl",
-        "LlamaNemotronVLForEmbedding",
-    ),
    # Technically Terratorch models work on images, both in
    # input and output. I am adding it here because it piggy-backs on embedding
    # models for the time being.
@@ -272,10 +269,13 @@ _LATE_INTERACTION_MODELS = {
    "HF_ColBERT": ("colbert", "ColBERTModel"),
    "ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
    "ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
+    "ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
    # [Multimodal]
    "ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
+    "ColPaliForRetrieval": ("colpali", "ColPaliModel"),
    "ColQwen3": ("colqwen3", "ColQwen3Model"),
    "OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
+    "ColQwen3_5": ("colqwen3_5", "ColQwen3_5Model"),
    "Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"),
 }

@@ -302,7 +302,7 @@ _SEQUENCE_CLASSIFICATION_MODELS = {
        "bert_with_rope",
        "GteNewForSequenceClassification",
    ),
-    "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),  # noqa: E501
+    "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"),
    "LlamaBidirectionalForSequenceClassification": (
        "llama",
        "LlamaBidirectionalForSequenceClassification",
@@ -366,13 +366,13 @@ _MULTIMODAL_MODELS = {
        "fireredasr2",
        "FireRedASR2ForConditionalGeneration",
    ),
-    "FunASRForConditionalGeneration": ("funasr", "FunASRForConditionalGeneration"),  # noqa: E501
+    "FunASRForConditionalGeneration": ("funasr", "FunASRForConditionalGeneration"),
    "FunAudioChatForConditionalGeneration": (
        "funaudiochat",
        "FunAudioChatForConditionalGeneration",
    ),
    "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
-    "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),  # noqa: E501
+    "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),
    "Gemma3nForConditionalGeneration": (
        "gemma3n_mm",
        "Gemma3nForConditionalGeneration",
@@ -381,7 +381,7 @@ _MULTIMODAL_MODELS = {
    "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
    "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),
    "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),
-    "GlmOcrForConditionalGeneration": ("glm_ocr", "GlmOcrForConditionalGeneration"),  # noqa: E501
+    "GlmOcrForConditionalGeneration": ("glm_ocr", "GlmOcrForConditionalGeneration"),
    "GraniteSpeechForConditionalGeneration": (
        "granite_speech",
        "GraniteSpeechForConditionalGeneration",
@@ -391,13 +391,7 @@ _MULTIMODAL_MODELS = {
        "hunyuan_vision",
        "HunYuanVLForConditionalGeneration",
    ),
-    "StepVLForConditionalGeneration": ("step_vl", "StepVLForConditionalGeneration"),
    "InternVLChatModel": ("internvl", "InternVLChatModel"),
-    "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
-    "OpenCUAForConditionalGeneration": (
-        "opencua",
-        "OpenCUAForConditionalGeneration",
-    ),
    "InternS1ForConditionalGeneration": (
        "interns1",
        "InternS1ForConditionalGeneration",
@@ -415,24 +409,22 @@ _MULTIMODAL_MODELS = {
        "Idefics3ForConditionalGeneration",
    ),
    "IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"),
-    "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),  # noqa: E501
    "KananaVForConditionalGeneration": ("kanana_v", "KananaVForConditionalGeneration"),
    "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
    "KeyeVL1_5ForConditionalGeneration": (
        "keye_vl1_5",
        "KeyeVL1_5ForConditionalGeneration",
    ),
-    "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
-    "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
-    "KimiK25ForConditionalGeneration": ("kimi_k25", "KimiK25ForConditionalGeneration"),  # noqa: E501
-    "MoonshotKimiaForCausalLM": ("kimi_audio", "KimiAudioForConditionalGeneration"),  # noqa: E501
+    "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),
+    "KimiK25ForConditionalGeneration": ("kimi_k25", "KimiK25ForConditionalGeneration"),
+    "MoonshotKimiaForCausalLM": ("kimi_audio", "KimiAudioForConditionalGeneration"),
    "LightOnOCRForConditionalGeneration": (
        "lightonocr",
        "LightOnOCRForConditionalGeneration",
    ),
    "Lfm2VlForConditionalGeneration": ("lfm2_vl", "Lfm2VLForConditionalGeneration"),
+    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),
    "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
-    "Llama4ForConditionalGeneration": ("mllama4", "Llama4ForConditionalGeneration"),  # noqa: E501
    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
    "LlavaNextForConditionalGeneration": (
        "llava_next",
@@ -446,7 +438,7 @@ _MULTIMODAL_MODELS = {
        "llava_onevision",
        "LlavaOnevisionForConditionalGeneration",
    ),
-    "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
+    "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),
    "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
    "MiniMaxVL01ForConditionalGeneration": (
        "minimax_vl_01",
@@ -460,7 +452,9 @@ _MULTIMODAL_MODELS = {
    ),
    "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
    "Molmo2ForConditionalGeneration": ("molmo2", "Molmo2ForConditionalGeneration"),
+    "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
    "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
+    "OpenCUAForConditionalGeneration": ("opencua", "OpenCUAForConditionalGeneration"),
    "OpenPanguVLForConditionalGeneration": (
        "openpangu_vl",
        "OpenPanguVLForConditionalGeneration",
@@ -479,9 +473,9 @@ _MULTIMODAL_MODELS = {
    ),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
-    "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
-    "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
-    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
+    "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),
+    "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),
+    "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),
    "Qwen2_5_VLForConditionalGeneration": (
        "qwen2_5_vl",
        "Qwen2_5_VLForConditionalGeneration",
@@ -506,39 +500,40 @@ _MULTIMODAL_MODELS = {
        "qwen3_asr",
        "Qwen3ASRForConditionalGeneration",
    ),
-    "Qwen3ASRRealtimeGeneration": (
-        "qwen3_asr_realtime",
-        "Qwen3ASRRealtimeGeneration",
-    ),
-    "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),  # noqa: E501
+    "Qwen3ASRRealtimeGeneration": ("qwen3_asr_realtime", "Qwen3ASRRealtimeGeneration"),
+    "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),
    "Qwen3VLMoeForConditionalGeneration": (
        "qwen3_vl_moe",
        "Qwen3VLMoeForConditionalGeneration",
    ),
-    "Qwen3_5ForConditionalGeneration": (
-        "qwen3_5",
-        "Qwen3_5ForConditionalGeneration",
-    ),
+    "Qwen3_5ForConditionalGeneration": ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
    "Qwen3_5MoeForConditionalGeneration": (
        "qwen3_5",
        "Qwen3_5MoeForConditionalGeneration",
    ),
+    "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
    "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
-    "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),  # noqa: E501
-    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
+    "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),
+    "StepVLForConditionalGeneration": ("step_vl", "StepVLForConditionalGeneration"),
+    "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),
+    "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),
    "Tarsier2ForConditionalGeneration": (
        "qwen2_vl",
        "Tarsier2ForConditionalGeneration",
    ),
    "UltravoxModel": ("ultravox", "UltravoxModel"),
-    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
-    "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),  # noqa: E501
+    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),
+    "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),
    # [Encoder-decoder]
+    "CohereASRForConditionalGeneration": (
+        "cohere_asr",
+        "CohereASRForConditionalGeneration",
+    ),
    "NemotronParseForConditionalGeneration": (
        "nemotron_parse",
        "NemotronParseForConditionalGeneration",
    ),
-    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
+    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),
 }

 _SPECULATIVE_DECODING_MODELS = {
@@ -648,14 +643,17 @@ _PREVIOUSLY_SUPPORTED_MODELS = {
    "Phi4MultimodalForCausalLM": "0.12.0",
    # encoder-decoder models except whisper
    # have been removed for V0 deprecation.
-    "BartModel": "0.10.2",
-    "BartForConditionalGeneration": "0.10.2",
    "DonutForConditionalGeneration": "0.10.2",
-    "Florence2ForConditionalGeneration": "0.10.2",
-    "MBartForConditionalGeneration": "0.10.2",
    "MllamaForConditionalGeneration": "0.10.2",
 }

+_OOT_SUPPORTED_MODELS = {
+    "BartModel": "https://github.com/vllm-project/bart-plugin",
+    "BartForConditionalGeneration": "https://github.com/vllm-project/bart-plugin",
+    "Florence2ForConditionalGeneration": "https://github.com/vllm-project/bart-plugin",
+    "MBartForConditionalGeneration": "https://github.com/vllm-project/bart-plugin",
+}
+

 @dataclass(frozen=True)
 class _ModelInfo:
@@ -952,6 +950,14 @@ class _ModelRegistry:
                    "Please use an older version of vLLM if you want to "
                    "use this model architecture."
                )
+            if arch in _OOT_SUPPORTED_MODELS:
+                plugin_url = _OOT_SUPPORTED_MODELS[arch]
+
+                raise ValueError(
+                    f"Model architecture {arch} is not supported in-tree anymore. "
+                    f"Please install the plugin at {plugin_url} if you want to "
+                    "use this model architecture."
+                )

        raise ValueError(
            f"Model architectures {architectures} are not supported for now. "

--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -10,6 +10,7 @@ from transformers import RobertaConfig

 from vllm.config import ModelConfig, PoolerConfig, VllmConfig
 from vllm.model_executor.layers.pooler import (
+    BgeM3Pooler,
    BOSEOSFilter,
    DispatchPooler,
    Pooler,
@@ -216,24 +217,29 @@ class BgeM3EmbeddingModel(RobertaEmbeddingModel):
        self.colbert_linear = nn.Linear(
            self.hidden_size, self.hidden_size, dtype=self.head_dtype
        )
+        embed_pooler = pooler_for_embed(pooler_config)
+        token_classify_pooler = BOSEOSFilter(
+            pooler_for_token_classify(
+                pooler_config,
+                pooling=AllPool(),
+                classifier=self.sparse_linear,
+                act_fn=torch.relu,
+            ),
+            self.bos_token_id,
+            self.eos_token_id,
+        )

        return DispatchPooler(
            {
-                "embed": pooler_for_embed(pooler_config),
+                "embed": embed_pooler,
                "token_embed": BOSEOSFilter(
                    pooler_for_token_embed(pooler_config, self.colbert_linear),
                    self.bos_token_id,
                    # for some reason m3 only filters the bos for colbert vectors
                ),
-                "token_classify": BOSEOSFilter(
-                    pooler_for_token_classify(
-                        pooler_config,
-                        pooling=AllPool(),
-                        classifier=self.sparse_linear,
-                        act_fn=torch.relu,
-                    ),
-                    self.bos_token_id,
-                    self.eos_token_id,
+                "token_classify": token_classify_pooler,
+                "embed&token_classify": BgeM3Pooler(
+                    token_classify_pooler, embed_pooler
                ),
            }
        )

--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -7,14 +7,12 @@
 # Copyright (c) 2025 Skywork
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Iterable, Mapping
 from typing import Annotated, Literal, TypeAlias

 import torch
 import torch.nn as nn
-import torchvision.transforms as T
-from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import PretrainedConfig

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -26,40 +24,23 @@ from vllm.model_executor.models.intern_vit import (
    InternVisionPatchModel,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.inputs import (
-    MultiModalDataDict,
-    MultiModalFieldConfig,
-    MultiModalKwargsItems,
-)
-from vllm.multimodal.parse import (
-    ImageEmbeddingItems,
-    ImageProcessorItems,
-    ImageSize,
-    MultiModalDataItems,
-)
-from vllm.multimodal.processing import (
-    BaseDummyInputsBuilder,
-    BaseMultiModalProcessor,
-    BaseProcessingInfo,
-    PromptReplacement,
-    PromptUpdate,
-    PromptUpdateDetails,
-)
+from vllm.multimodal.inputs import MultiModalDataDict
+from vllm.multimodal.processing import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.internvl import (
+    InternVLImageProcessor,
+    InternVLProcessor,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .internvl import (
+    BaseInternVLDummyInputsBuilder,
+    BaseInternVLMultiModalProcessor,
+    BaseInternVLProcessingInfo,
+)
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix

-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<IMG_CONTEXT>"
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-

 class SkyworkR1VImagePixelInputs(TensorSchema):
    """
@@ -106,418 +87,36 @@ SkyworkR1VImageInputs: TypeAlias = (
 )


-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
-def build_transform(input_size: int):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    return T.Compose(
-        [
-            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
-            T.Resize(
-                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
-            ),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
-        ]
-    )
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
-def find_closest_aspect_ratio(
-    aspect_ratio: float,
-    target_ratios: list[tuple[int, int]],
-    *,
-    width: int,
-    height: int,
-    image_size: int,
-) -> tuple[int, int]:
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def resolve_skyworkr1v_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_skyworkr1v_target_ratios(
-    min_num: int,
-    max_num: int,
-) -> list[tuple[int, int]]:
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if min_num <= i * j <= max_num
-    }
-    return sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-
-def calculate_skyworkr1v_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height
-
-
-def dynamic_preprocess_skyworkr1v(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> list[Image.Image]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_skyworkr1v_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images
-
-
-# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
-def image_to_pixel_values_skyworkr1v(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess_skyworkr1v(
-        image,
-        target_ratios=target_ratios,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values
-
-
-class SkyworkR1VProcessor:
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        image_size: int = config.vision_config.image_size
-        patch_size: int = config.vision_config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = config.min_dynamic_patch
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = config.max_dynamic_patch
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = config.dynamic_image_size
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_skyworkr1v_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-        return get_skyworkr1v_target_ratios(min_num, max_num)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_skyworkr1v_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_skyworkr1v(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-            )
-            for image in images
-        ]
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-
-                image_repl = self.get_image_repl(feature_size, num_patches)
+class SkyworkR1VProcessingInfo(BaseInternVLProcessingInfo):
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()
+        vision_config = config.vision_config

-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
+        kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
+        kwargs.setdefault("image_size", vision_config.image_size)
+        kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
+        kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
+        kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
+        kwargs.setdefault("use_thumbnail", config.use_thumbnail)

-        text_inputs = self.tokenizer(text)
+        return InternVLImageProcessor(**kwargs)

-        combined_outputs = {**text_inputs, **image_inputs}
+    def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
+        config = self.get_hf_config()
+        vision_config = config.vision_config

-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
+        image_processor = self.get_image_processor(**kwargs)
+        image_size = image_processor.image_size
+        patch_size = vision_config.patch_size
+        downsample_ratio = config.downsample_ratio
+        image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))

-
-class SkyworkR1VProcessingInfo(BaseProcessingInfo):
-    def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
-        return self.ctx.init_processor(
-            SkyworkR1VProcessor,
-            config=self.get_hf_config(),
+        return InternVLProcessor(
            tokenizer=self.get_tokenizer(),
-            **kwargs,
-        )
-
-    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
-        return {"image": None}
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        processor: SkyworkR1VProcessor,
-    ) -> int:
-        return processor.get_num_image_tokens(
-            image_width=image_width,
-            image_height=image_height,
+            image_processor=image_processor,
+            image_seq_length=image_seq_length,
        )

-    def get_image_size_with_most_features(self) -> ImageSize:
-        processor = self.get_hf_processor()
-
-        base_size = processor.image_size
-        target_ratios = processor.resolve_target_ratios()
-
-        largest_feature_size, largest_feature_pinpoint = 0, None
-        for wr, hr in target_ratios:
-            width, height = base_size * wr, base_size * hr
-
-            feat_size = self.get_num_image_tokens(
-                image_width=width,
-                image_height=height,
-                processor=processor,
-            )
-            if feat_size > largest_feature_size:
-                largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width, height=height)
-
-        if largest_feature_size == 0 or largest_feature_pinpoint is None:
-            raise ValueError("Cannot have a largest feature size of 0!")
-
-        return largest_feature_pinpoint
-

 class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]):
    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -546,102 +145,10 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
        }


-class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessingInfo]):
-    def _call_hf_processor(
-        self,
-        prompt: str,
-        mm_data: Mapping[str, object],
-        mm_kwargs: Mapping[str, object],
-        tok_kwargs: Mapping[str, object],
-    ) -> BatchFeature:
-        processed_outputs = super()._call_hf_processor(
-            prompt=prompt,
-            mm_data=mm_data,
-            mm_kwargs=mm_kwargs,
-            tok_kwargs=tok_kwargs,
-        )
-
-        hf_processor = self.info.get_hf_processor(**mm_kwargs)
-        image_token_id = hf_processor.image_token_id
-
-        # Since there may be extra tokens in the feature placeholders,
-        # we need to pass the image token ID to the model to select the
-        # tokens to merge from the vision encoder outputs
-        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
-
-        return processed_outputs
-
-    def _get_mm_fields_config(
-        self,
-        hf_inputs: BatchFeature,
-        hf_processor_mm_kwargs: Mapping[str, object],
-    ) -> Mapping[str, MultiModalFieldConfig]:
-        image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
-        num_images = len(image_num_patches)
-
-        return dict(
-            pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
-                "image", image_num_patches
-            ),
-            image_num_patches=MultiModalFieldConfig.batched("image"),
-            image_embeds=MultiModalFieldConfig.batched("image"),
-            image_token_id=MultiModalFieldConfig.shared("image", num_images),
-        )
-
-    def _get_prompt_updates(
-        self,
-        mm_items: MultiModalDataItems,
-        hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargsItems,
-    ) -> Sequence[PromptUpdate]:
-        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
-
-        out_mm_data = out_mm_kwargs.get_data()
-        if "image_num_patches" in out_mm_data:
-            image_num_patches = out_mm_data["image_num_patches"]
-            assert isinstance(image_num_patches, torch.Tensor)
-            image_num_patches = image_num_patches.tolist()
-        elif "image_embeds" in out_mm_data:
-            # TODO: Use image size information in dictionary embedding inputs
-            # to compute num_patches (similar to Qwen2-VL)
-            image_num_patches = [None] * len(out_mm_data["image_embeds"])
-        else:
-            image_num_patches = []
-
-        def get_replacement_skyworkr1v(item_idx: int):
-            images = mm_items.get_items(
-                "image", (ImageEmbeddingItems, ImageProcessorItems)
-            )
-
-            if isinstance(images, ImageEmbeddingItems):
-                feature_size = images.get_feature_size(item_idx)
-            else:
-                image_size = images.get_image_size(item_idx)
-                feature_size = self.info.get_num_image_tokens(
-                    image_width=image_size.width,
-                    image_height=image_size.height,
-                    processor=hf_processor,
-                )
-
-            num_patches = image_num_patches[item_idx]
-            if num_patches is not None:
-                assert isinstance(num_patches, int)
-
-            return hf_processor.get_image_repl(feature_size, num_patches)
-
-        return [
-            PromptReplacement(
-                modality="image",
-                target="<image>",
-                replacement=get_replacement_skyworkr1v,
-            )
-        ]
-
-
 @MULTIMODAL_REGISTRY.register_processor(
-    SkyworkR1VMultiModalProcessor,
+    BaseInternVLMultiModalProcessor,
    info=SkyworkR1VProcessingInfo,
-    dummy_inputs=SkyworkR1VDummyInputsBuilder,
+    dummy_inputs=BaseInternVLDummyInputsBuilder,
 )
 class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
    @classmethod

--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -2,18 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from itertools import product
-from math import ceil, sqrt
+from math import sqrt
 from typing import Annotated, Any, Literal, TypeAlias

-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from PIL import Image
-from torchvision import transforms
-from torchvision.transforms.functional import InterpolationMode
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -43,8 +38,12 @@ from vllm.multimodal.processing import (
    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
-from vllm.transformers_utils.configs import Step3VisionEncoderConfig
+from vllm.transformers_utils.configs.step3_vl import Step3VisionEncoderConfig
+from vllm.transformers_utils.processors.step3_vl import (
+    MAX_IMAGE_SIZE,
+    Step3VLImageProcessor,
+    Step3VLProcessor,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -89,447 +88,32 @@ class Step3VLImageEmbeddingInputs(TensorSchema):

 Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs

-ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None]
-
-MAX_IMAGE_SIZE: int = 3024
-
-
-class Step3VisionProcessor:
-    def __init__(self, size, interpolation_mode="bicubic", patch_size=None):
-        mean = [0.48145466, 0.4578275, 0.40821073]
-        std = [0.26862954, 0.26130258, 0.27577711]
-        patch_size = patch_size if patch_size is not None else size
-
-        self.transform = transforms.Compose(
-            [
-                transforms.ToTensor(),
-                transforms.Normalize(mean, std),
-                transforms.Resize(
-                    (size, size),
-                    interpolation=InterpolationMode.BICUBIC
-                    if interpolation_mode == "bicubic"
-                    else InterpolationMode.BILINEAR,
-                    antialias=True,
-                ),
-            ]
-        )
-
-        self.patch_transform = (
-            transforms.Compose(
-                [
-                    transforms.ToTensor(),
-                    transforms.Normalize(mean, std),
-                    transforms.Resize(
-                        (patch_size, patch_size),
-                        interpolation=InterpolationMode.BICUBIC
-                        if interpolation_mode == "bicubic"
-                        else InterpolationMode.BILINEAR,
-                        antialias=True,
-                    ),
-                ]
-            )
-            if patch_size is not None
-            else None
-        )
-
-    def __call__(self, image, is_patch=False):
-        if is_patch:
-            return {"pixel_values": self.patch_transform(image).unsqueeze(0)}
-        else:
-            return {"pixel_values": self.transform(image).unsqueeze(0)}
-
-
-class ImagePatcher:
-    def __init__(self, enable_patch: bool = True) -> None:
-        self.enable_patch = enable_patch
-
-    def determine_window_size(self, long: int, short: int) -> int:
-        if long < 728:
-            return short if long / short > 1.5 else 0
-        return min(short, 504) if long / short > 4 else 504
-
-    def slide_window(
-        self,
-        width: int,
-        height: int,
-        sizes: list[tuple[int, int]],
-        steps: list[tuple[int, int]],
-        img_rate_thr: float = 0.6,
-    ) -> tuple[list[tuple[int, int, int, int]], tuple[int, int]]:
-        assert 1 >= img_rate_thr >= 0, "The `in_rate_thr` should lie in 0~1"
-        windows = []
-        # Sliding windows.
-        for size, step in zip(sizes, steps):
-            size_w, size_h = size
-            step_w, step_h = step
-
-            x_num = 1 if width <= size_w else ceil((width - size_w) / step_w + 1)
-            x_start = [step_w * i for i in range(x_num)]
-            if len(x_start) > 1 and x_start[-1] + size_w > width:
-                x_start[-1] = width - size_w
-
-            y_num = 1 if height <= size_h else ceil((height - size_h) / step_h + 1)
-            y_start = [step_h * i for i in range(y_num)]
-            if len(y_start) > 1 and y_start[-1] + size_h > height:
-                y_start[-1] = height - size_h
-
-            start = np.array(list(product(y_start, x_start)), dtype=int)
-            start[:, [0, 1]] = start[:, [1, 0]]
-            windows.append(np.concatenate([start, start + size], axis=1))
-        windows = np.concatenate(windows, axis=0)
-
-        return [
-            (int(box[0]), int(box[1]), int(box[2] - box[0]), int(box[3] - box[1]))
-            for box in windows
-        ], (x_num, y_num)
-
-    def square_pad(self, img: Image.Image) -> Image.Image:
-        w, h = img.size
-        if w == h:
-            return img
-        size = max(w, h)
-        padded = Image.new(img.mode, (size, size), 0)
-        padded.paste(img, (0, 0))
-        return padded
-
-    def get_image_size_for_padding(
-        self, img_width: int, img_height: int
-    ) -> tuple[int, int]:
-        ratio = img_width / img_height
-        if min(img_height, img_width) < 32 and (ratio > 4 or ratio < 1 / 4):
-            new_size = max(img_height, img_width)
-            return new_size, new_size
-        return img_width, img_height
-
-    def get_image_size_for_preprocess(
-        self, img_width: int, img_height: int
-    ) -> tuple[int, int]:
-        if max(img_height, img_width) > MAX_IMAGE_SIZE:
-            scale_factor = MAX_IMAGE_SIZE / max(img_height, img_width)
-            img_width = int(img_width * scale_factor)
-            img_height = int(img_height * scale_factor)
-        return img_width, img_height
-
-    def get_image_size_for_crop(
-        self, img_width: int, img_height: int, window_size: int
-    ):
-        w_ratio = img_width / window_size
-        h_ratio = img_height / window_size
-
-        if w_ratio < 1:
-            width_new = img_width
-        else:
-            decimal_w = w_ratio - img_width // window_size
-            w_ratio = int(w_ratio) + 1 if decimal_w > 0.2 else int(w_ratio)
-            width_new = window_size * w_ratio
-        if h_ratio < 1:
-            height_new = img_height
-        else:
-            decimal_h = h_ratio - img_height // window_size
-            h_ratio = int(h_ratio) + 1 if decimal_h > 0.2 else int(h_ratio)
-            height_new = window_size * h_ratio
-        return int(width_new), int(height_new)
-
-    def patch_crop(self, img: Image.Image, i: int, j: int, th: int, tw: int):
-        target = img.crop((j, i, j + tw, i + th))
-        return target
-
-    def get_num_patches(self, img_width: int, img_height: int) -> tuple[int, int]:
-        img_width, img_height = self.get_image_size_for_padding(img_width, img_height)
-        img_width, img_height = self.get_image_size_for_preprocess(
-            img_width, img_height
-        )
-        window_size = self.determine_window_size(
-            max(img_height, img_width), min(img_height, img_width)
-        )
-        if window_size == 0 or not self.enable_patch:
-            return 0, 0
-        else:
-            img_width, img_height = self.get_image_size_for_crop(
-                img_width, img_height, window_size
-            )
-            center_list, (x_num, y_num) = self.slide_window(
-                img_width,
-                img_height,
-                [(window_size, window_size)],
-                [(window_size, window_size)],
-            )
-            full_rows = (len(center_list) - 1) // x_num + 1
-            if len(center_list) > 0 and len(center_list) % x_num == 0:
-                full_rows -= 1
-            return len(center_list), full_rows
-
-    def __call__(
-        self, img: Image.Image
-    ) -> tuple[Image.Image, list[Image.Image], list[bool] | None]:
-        img_width, img_height = img.size
-        new_img_width, new_img_height = self.get_image_size_for_padding(
-            img_width, img_height
-        )
-        if new_img_width != img_width or new_img_height != img_height:
-            img = self.square_pad(img)
-            img_width, img_height = img.size
-
-        new_img_width, new_img_height = self.get_image_size_for_preprocess(
-            img_width, img_height
-        )
-        img = img.resize((new_img_width, new_img_height), Image.Resampling.BILINEAR)
-        window_size = self.determine_window_size(
-            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
-        )
-
-        if window_size == 0 or not self.enable_patch:
-            return img, [], None
-        else:
-            new_img_width, new_img_height = self.get_image_size_for_crop(
-                new_img_width, new_img_height, window_size
-            )
-            if (new_img_width, new_img_height) != (img_width, img_height):
-                img_for_crop = img.resize(
-                    (new_img_width, new_img_height), Image.Resampling.BILINEAR
-                )
-            else:
-                img_for_crop = img
-
-            patches = []
-            newlines = []
-            center_list, (x_num, y_num) = self.slide_window(
-                new_img_width,
-                new_img_height,
-                [(window_size, window_size)],
-                [(window_size, window_size)],
-            )
-            for patch_id, center_lf_point in enumerate(center_list):
-                x, y, patch_w, patch_h = center_lf_point
-                big_patch = self.patch_crop(img_for_crop, y, x, patch_h, patch_w)
-                patches.append(big_patch)
-                if (patch_id + 1) % x_num == 0:
-                    newlines.append(patch_id)
-
-            if newlines and newlines[-1] == len(patches) - 1:
-                newlines.pop()
-
-            return (
-                img,
-                patches,
-                [i in newlines for i in range(len(patches))]
-                if len(patches) > 0
-                else None,
-            )
-
-
-class Step3VLProcessor:
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-        self.image_size = 728
-        self.patch_size = 504
-        self.image_preprocessor = Step3VisionProcessor(
-            self.image_size, "bilinear", self.patch_size
-        )
-
-        self.num_image_feature_size = 169
-        self.num_patch_feature_size = 81
-        self.image_token = "<im_patch>"
-        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
-        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size
-
-        # Respect vision config switch to enable/disable patch extraction.
-        # For video understanding, it's preferable to disable patch.
-        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
-        self.patcher = ImagePatcher(enable_patch=enable_patch)
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[self.image_token]
-
-    def get_num_image_tokens(self, img_width: int, img_height: int) -> int:
-        num_patches, num_newlines = self.patcher.get_num_patches(img_width, img_height)
-
-        return (
-            num_patches * (self.num_patch_feature_size + 2)
-            + self.num_image_feature_size
-            + 2
-            + num_newlines
-        )
-
-    def _split_images(self, images: list[Image.Image]) -> list[ImageWithPatches]:
-        result = []
-        for img in images:
-            result.append(self.patcher(img))
-        return result
-
-    def _convert_images_to_pixel_values(
-        self,
-        images: list[Image.Image],
-        is_patch: bool = False,
-    ) -> list[torch.Tensor]:
-        return [
-            self.image_preprocessor(img, is_patch=is_patch)["pixel_values"]
-            for img in images
-        ]
-
-    def _get_patch_repl(
-        self,
-        num_patches: int,
-        patch_newline_mask: list[bool] | None,
-    ) -> tuple[str, list[int]]:
-        text = ""
-        token_ids = []
-        for i in range(num_patches):
-            assert len(patch_newline_mask) == num_patches
-            text += f"<patch_start>{self.patch_feature_placeholder}<patch_end>"
-            token_ids.extend(
-                [self.tokenizer.convert_tokens_to_ids("<patch_start>")]
-                + [self.image_token_id] * self.num_patch_feature_size
-                + [self.tokenizer.convert_tokens_to_ids("<patch_end>")]
-            )
-            if patch_newline_mask and patch_newline_mask[i]:
-                text += "<patch_newline>"
-                token_ids.append(
-                    self.tokenizer.convert_tokens_to_ids("<patch_newline>")
-                )
-        return text, token_ids
-
-    def _get_image_repl(
-        self,
-        num_images: int,
-    ) -> tuple[str, list[int]]:
-        text = f"<im_start>{self.image_feature_placeholder}<im_end>"
-        token_ids = (
-            [self.tokenizer.convert_tokens_to_ids("<im_start>")]
-            + [self.image_token_id] * self.num_image_feature_size
-            + [self.tokenizer.convert_tokens_to_ids("<im_end>")]
-        )
-        return text * num_images, token_ids * num_images

-    def _get_image_repl_features(
-        self,
-        num_images: int,
-        num_patches: int,
-        patch_new_line_idx: list[bool] | None,
-    ) -> tuple[str, list[int]]:
-        if num_patches > 0:
-            patch_repl, patch_repl_ids = self._get_patch_repl(
-                num_patches, patch_new_line_idx
-            )
-        else:
-            patch_repl = ""
-            patch_repl_ids = []
-        image_repl, image_repl_ids = self._get_image_repl(num_images)
-        return patch_repl + image_repl, patch_repl_ids + image_repl_ids
-
-    def replace_placeholder(self, text: str, placeholder: str, repls: list[str]) -> str:
-        parts = text.split(placeholder)
-
-        if len(parts) - 1 != len(repls):
-            raise ValueError(
-                "The number of placeholders does not match the number of replacements."
-            )
-
-        result = [parts[0]]
-        for i, repl in enumerate(repls):
-            result.append(repl)
-            result.append(parts[i + 1])
-
-        return "".join(result)
+class Step3VLProcessingInfo(BaseProcessingInfo):
+    def get_image_processor(self, **kwargs):
+        config = self.get_hf_config()

-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
-        if len(images) == 0:
-            image_inputs = {}
-            text_inputs = self.tokenizer(text)
-        else:
-            split_images_data = self._split_images(images)
-            pixel_values_lst = []
-            patch_pixel_values_lst = []
-            patch_newline_mask_lst = []
-            image_repl_str_lst = []
-            image_repl_ids_lst = []
-            num_patches = []
-            for raw_img, img_patches, patch_newline_mask in split_images_data:
-                pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img]))
-
-                if len(img_patches) > 0:
-                    patch_pixel_values_lst.extend(
-                        self._convert_images_to_pixel_values(img_patches, is_patch=True)
-                    )
-                num_patches.append(len(img_patches))
-
-                image_repl_str, image_repl_ids = self._get_image_repl_features(
-                    1, len(img_patches), patch_newline_mask
-                )
-                image_repl_str_lst.append(image_repl_str)
-                image_repl_ids_lst.extend(image_repl_ids)
-
-                if patch_newline_mask is not None:
-                    patch_newline_mask_lst.extend(patch_newline_mask)
-
-            pixel_values = torch.cat(pixel_values_lst)
-            patch_size = self.patch_size
-            image_inputs = {
-                "pixel_values": pixel_values,
-                "num_patches": num_patches,
-                "patch_pixel_values": (
-                    torch.cat(patch_pixel_values_lst)
-                    if patch_pixel_values_lst
-                    else pixel_values.new_empty((0, 3, patch_size, patch_size))
-                ),
-                "patch_newline_mask": torch.tensor(
-                    patch_newline_mask_lst, dtype=torch.bool
-                ),
-            }
-
-            text = [
-                self.replace_placeholder(t, self.image_token, image_repl_str_lst)
-                for t in text
-            ]
-            text_inputs = self.tokenizer(text)
-
-        return BatchFeature(
-            {
-                **text_inputs,
-                **image_inputs,
-            },
-            tensor_type=return_tensors,
+        kwargs.setdefault(
+            "enable_patch",
+            getattr(config.vision_config, "enable_patch", True),
        )

+        return Step3VLImageProcessor(**kwargs)

-class Step3VLProcessingInfo(BaseProcessingInfo):
    def get_hf_processor(self) -> Step3VLProcessor:
        return Step3VLProcessor(
-            self.get_hf_config(),
-            self.get_tokenizer(),
+            tokenizer=self.get_tokenizer(),
+            image_processor=self.get_image_processor(),
        )

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": None}

    def get_max_image_tokens(self) -> int:
-        hf_processor = self.get_hf_processor()
-        return hf_processor.get_num_image_tokens(
-            self.get_image_size_with_most_features().width,
-            self.get_image_size_with_most_features().height,
-        )
+        image_processor = self.get_image_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return image_processor.get_num_image_tokens(target_width, target_height)

    def get_mm_max_tokens_per_item(
        self,
@@ -539,20 +123,7 @@ class Step3VLProcessingInfo(BaseProcessingInfo):
        return {"image": self.get_max_image_tokens()}

    def get_image_size_with_most_features(self) -> ImageSize:
-        return ImageSize(3024, 3024)
-
-    def get_num_mm_tokens(self, mm_data: MultiModalDataDict) -> int:
-        if len(mm_data) != 1 or "image" not in mm_data:
-            raise ValueError("mm_data could only contain one key 'image' for steo1o")
-
-        image_data = mm_data["image"]
-        if not isinstance(image_data, (list, tuple)):
-            image_data = [image_data]
-
-        return sum(
-            self.get_hf_processor().get_num_image_tokens(img.width, img.height)
-            for img in image_data
-        )
+        return ImageSize(MAX_IMAGE_SIZE, MAX_IMAGE_SIZE)


 class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
@@ -594,13 +165,11 @@ class Step3VLMultiModalProcessor(BaseMultiModalProcessor[Step3VLProcessingInfo])
        def get_replacement_step1o(item_idx: int):
            out_item = out_mm_kwargs["image"][item_idx]
            num_patches = int(out_item["num_patches"].data)
-            if num_patches > 0:
-                patch_newline_mask = out_item["patch_newline_mask"].data
-                image_repl_ids = hf_processor._get_image_repl_features(
-                    1, num_patches, patch_newline_mask.tolist()
-                )[1]
-            else:
-                image_repl_ids = hf_processor._get_image_repl_features(1, 0, None)[1]
+            patch_newline_mask = out_item["patch_newline_mask"].data
+            image_repl_ids = hf_processor.get_image_repl_feature_ids(
+                1, num_patches, patch_newline_mask.tolist()
+            )
+
            return PromptUpdateDetails.select_token_id(
                seq=image_repl_ids,
                embed_token_id=image_placeholder_token_id,

--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -25,7 +25,6 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelL
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.llava import LlavaDummyInputsBuilder
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.cache import BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems
 from vllm.multimodal.parse import (
    ImageEmbeddingItems,
@@ -34,10 +33,8 @@ from vllm.multimodal.parse import (
    MultiModalDataItems,
 )
 from vllm.multimodal.processing import (
-    BaseDummyInputsBuilder,
    BaseMultiModalProcessor,
    BaseProcessingInfo,
-    InputProcessingContext,
    PromptReplacement,
    PromptUpdate,
 )
@@ -329,25 +326,6 @@ class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]):
        ]


-def _build_tarsier_hf_info(ctx: InputProcessingContext) -> TarsierProcessingInfo:
-    return TarsierProcessingInfo(ctx)
-
-
-def _build_tarsier_hf_processor(
-    info: _I_Tarsier,
-    dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
-    *,
-    cache: BaseMultiModalProcessorCache | None = None,
-) -> BaseMultiModalProcessor:
-    if isinstance(info, TarsierProcessingInfo):
-        return TarsierMultiModalProcessor(
-            info,
-            dummy_inputs,
-            cache=cache,
-        )
-    raise NotImplementedError(type(info))
-
-
 def init_vision_tower_for_tarsier(
    hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
    quant_config: QuantizationConfig | None,
@@ -395,8 +373,8 @@ def init_vision_tower_for_tarsier(


 @MULTIMODAL_REGISTRY.register_processor(
-    _build_tarsier_hf_processor,
-    info=_build_tarsier_hf_info,
+    TarsierMultiModalProcessor,
+    info=TarsierProcessingInfo,
    dummy_inputs=TarsierDummyInputsBuilder,
 )
 class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):

--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -404,12 +404,14 @@ class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin):
            kwargs["layer_head_mask"] = None

        for layer in self.layers:
-            layer_outputs = layer(
+            hidden_states = layer(
                hidden_states,
                attention_mask=extended_attention_mask,
                **kwargs,
            )
-            hidden_states = layer_outputs[0]
+            # BC version that allows for the old tupled output
+            if isinstance(hidden_states, tuple):
+                hidden_states = hidden_states[0]

        hidden_states = self.ln_post(hidden_states)
        hidden_states = self.linear_out(hidden_states)
@@ -509,13 +511,14 @@ class ModifiedWhisperEncoder(WhisperEncoder):
            kwargs["layer_head_mask"] = None

        for encoder_layer in self.layers:
-            layer_outputs = encoder_layer(
+            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )
-
-            hidden_states = layer_outputs[0]
+            # BC version that allows for the old tupled output
+            if isinstance(hidden_states, tuple):
+                hidden_states = hidden_states[0]

        hidden_states = self.layer_norm(hidden_states)
        return hidden_states

--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -150,8 +150,10 @@ def create_whisper_attention_backend_with_block_pooling(
            new_common_attn_metadata.query_start_loc *= block_pool_size
            new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
            new_common_attn_metadata.seq_lens *= block_pool_size
-            new_common_attn_metadata._seq_lens_cpu *= block_pool_size
-            new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
+            if new_common_attn_metadata._seq_lens_cpu is not None:
+                new_common_attn_metadata._seq_lens_cpu *= block_pool_size
+            if new_common_attn_metadata._num_computed_tokens_cpu is not None:
+                new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
            new_common_attn_metadata.num_actual_tokens *= block_pool_size
            new_common_attn_metadata.max_query_len *= block_pool_size
            new_common_attn_metadata.max_seq_len *= block_pool_size

--- a/vllm/model_executor/offloader/prefetch.py
+++ b/vllm/model_executor/offloader/prefetch.py
@@ -431,10 +431,32 @@ class _ModuleOffloader:

        Called after process_weights_after_loading to ensure _cpu_storage
        contains the final processed weights, not stale pre-loading data.
+
+        Parameters whose underlying nn.Parameter was deleted by
+        process_weights_after_loading (e.g. transient KV-cache scale params)
+        are pruned from self._param_offloaders so they do not participate in
+        buffer-pool allocation or prefetching.
        """
        for param_offloader in self._param_offloaders.values():
            param_offloader.sync_cpu_storage()

+        # Remove offloaders whose parameter was deleted during
+        # process_weights_after_loading (e.g. k_scale / v_scale).
+        deleted = [
+            name
+            for name, offloader in self._param_offloaders.items()
+            if getattr(offloader, "_param_deleted", False)
+        ]
+        if deleted:
+            logger.debug(
+                "Pruning %d transient offloaded param(s) that were deleted "
+                "by process_weights_after_loading: %s",
+                len(deleted),
+                deleted,
+            )
+            for name in deleted:
+                del self._param_offloaders[name]
+
    def get_param_infos(self) -> list[ParamInfo]:
        """Get parameter metadata for buffer pool allocation.

@@ -590,6 +612,11 @@ class _CpuParamOffloader(_BaseParamOffloader):
        super().__init__(module, param_name)
        self._cpu_storage: torch.Tensor | None = None
        self._gpu_buffer: torch.Tensor | None = None  # Store reference to GPU buffer
+        # Set to True if the underlying nn.Parameter was deleted by
+        # process_weights_after_loading (e.g. transient KV-cache scale params
+        # such as k_scale/v_scale created by BaseKVCacheMethod.create_weights
+        # and deleted after copying into permanent _k_scale buffers).
+        self._param_deleted: bool = False

        # Offload to CPU immediately to free GPU memory during model loading
        self._offload_to_cpu_internal()
@@ -696,8 +723,22 @@ class _CpuParamOffloader(_BaseParamOffloader):
        1. process_weights_after_loading may transform weights (quantization)
        2. device_loading_context creates NEW CPU tensors when moving back
        3. Our old _cpu_storage would have pre-processed or stale data
+
+        If the parameter no longer exists on the module (e.g. transient
+        KV-cache scale parameters such as k_scale/v_scale that are created
+        by BaseKVCacheMethod.create_weights() and then deleted by
+        process_weights_after_loading() after copying their values into
+        permanent _k_scale buffers), the offloader marks itself as deleted
+        and skips the sync.  The caller (_ModuleOffloader.sync_cpu_storage)
+        is responsible for removing these stale entries.
        """
-        self._update_cpu_storage_from_param()
+        try:
+            self._update_cpu_storage_from_param()
+        except AttributeError:
+            # The parameter was deleted by process_weights_after_loading.
+            # Drop the now-stale CPU storage so this offloader can be pruned.
+            self._param_deleted = True
+            self._cpu_storage = None

    def post_init(self):
        """No-op: offloading done in offload_to_cpu/assign_static_buffer."""

--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -19,6 +19,7 @@ from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
 )
 from vllm.model_executor.layers.linear import LinearBase
 from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+from vllm.model_executor.layers.quantization.mxfp8 import Mxfp8OnlineLinearMethod
 from vllm.tracing import instrument
 from vllm.utils.deep_gemm import (
    fp8_gemm_nt,
@@ -136,8 +137,9 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
    if not (
        isinstance(module, LinearBase)
        and isinstance(module.quant_method, Fp8LinearMethod)
-        and module.quant_method.block_quant
-        and not module.quant_method.use_marlin
+        and not isinstance(module.quant_method, Mxfp8OnlineLinearMethod)
+        and getattr(module.quant_method, "block_quant", False)
+        and not getattr(module.quant_method, "use_marlin", True)
    ):
        return False


--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -12,17 +12,35 @@ import torch
 from vllm.utils.import_utils import PlaceholderModule

 try:
-    import librosa
+    import av as av
 except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+    av = PlaceholderModule("av")  # type: ignore[assignment]

+try:
+    import resampy
+except ImportError:
+    resampy = PlaceholderModule("resampy")  # type: ignore[assignment]

 try:
    import scipy.signal as scipy_signal
 except ImportError:
    scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal")  # type: ignore[assignment]

+
 # ============================================================
+# Aligned with `librosa.get_duration` function
+def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
+    """Get the duration of an audio array in seconds.
+
+    Args:
+        y: Audio time series. Can be 1D (samples,) or 2D (channels, samples).
+        sr: Sample rate of the audio in Hz.
+
+    Returns:
+        Duration of the audio in seconds.
+    """
+    n_samples = y.shape[-1]
+    return float(n_samples) / sr


 class ChannelReduction(str, Enum):
@@ -153,13 +171,71 @@ def normalize_audio(
 # ============================================================


-def resample_audio_librosa(
+def resample_audio_pyav(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
 ) -> npt.NDArray[np.floating]:
-    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+    """Resample audio using PyAV (libswresample via FFmpeg).
+
+    Args:
+        audio: Input audio. Can be:
+            - 1D array ``(samples,)``: mono audio
+            - 2D array ``(channels, samples)``: stereo audio
+        orig_sr: Original sample rate in Hz.
+        target_sr: Target sample rate in Hz.
+
+    Returns:
+        Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
+    """
+    orig_sr_int = int(round(orig_sr))
+    target_sr_int = int(round(target_sr))
+
+    if orig_sr_int == target_sr_int:
+        return audio
+
+    if audio.ndim == 2:
+        # Resample each channel independently and re-stack.
+        return np.stack(
+            [
+                resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
+                for ch in audio
+            ],
+            axis=0,
+        )
+
+    expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
+
+    # from_ndarray expects shape (channels, samples) for planar formats.
+    # libswresample requires a minimum number of input samples to produce
+    # output frames; pad short inputs with zeros so we always get output,
+    # then trim to the expected output length.
+    _MIN_SAMPLES = 1024
+    audio_f32 = np.asarray(audio, dtype=np.float32)
+    if len(audio_f32) < _MIN_SAMPLES:
+        audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
+    audio_f32 = audio_f32.reshape(1, -1)
+
+    resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
+
+    frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
+    frame.sample_rate = orig_sr_int
+
+    out_frames = resampler.resample(frame)
+    out_frames.extend(resampler.resample(None))  # flush buffered samples
+
+    result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
+    return result[:expected_len]
+
+
+def resample_audio_resampy(
+    audio: npt.NDArray[np.floating],
+    *,
+    orig_sr: float,
+    target_sr: float,
+) -> npt.NDArray[np.floating]:
+    return resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)


 def resample_audio_scipy(
@@ -167,7 +243,7 @@ def resample_audio_scipy(
    *,
    orig_sr: float,
    target_sr: float,
-):
+) -> npt.NDArray[np.floating]:
    if orig_sr > target_sr:
        return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
    elif orig_sr < target_sr:
@@ -181,7 +257,7 @@ class AudioResampler:
    def __init__(
        self,
        target_sr: float | None = None,
-        method: Literal["librosa", "scipy"] = "librosa",
+        method: Literal["pyav", "resampy", "scipy"] = "resampy",
    ):
        self.target_sr = target_sr
        self.method = method
@@ -203,8 +279,10 @@ class AudioResampler:
            abs_tol=1e-6,
        ):
            return audio
-        if self.method == "librosa":
-            return resample_audio_librosa(
+        if self.method == "pyav":
+            return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
+        if self.method == "resampy":
+            return resample_audio_resampy(
                audio, orig_sr=orig_sr, target_sr=self.target_sr
            )
        elif self.method == "scipy":
@@ -214,7 +292,7 @@ class AudioResampler:
        else:
            raise ValueError(
                f"Invalid resampling method: {self.method}. "
-                "Supported methods are 'librosa' and 'scipy'."
+                "Supported methods are 'pyav', 'resampy' and 'scipy'."
            )



--- a/vllm/multimodal/media/audio.py
+++ b/vllm/multimodal/media/audio.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
+import math
 from io import BytesIO
 from pathlib import Path

@@ -15,58 +15,80 @@ from vllm.utils.serial_utils import tensor2base64
 from .base import MediaIO

 try:
-    import librosa
+    import av
 except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+    av = PlaceholderModule("av")  # type: ignore[assignment]

 try:
    import soundfile
 except ImportError:
    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]

+
 try:
-    import av
+    import resampy
 except ImportError:
-    av = PlaceholderModule("av")  # type: ignore[assignment]
+    resampy = PlaceholderModule("resampy")  # type: ignore[assignment]


-def extract_audio_from_video_bytes(
-    data: bytes,
-) -> tuple[npt.NDArray, float]:
-    """Extract the audio track from raw video bytes using PyAV.
+# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
+# being librosa's main backend. Used to validate if an audio loading error is due to a
+# server error vs a client error (invalid audio file).
+# 1 = unrecognised format      (file is not a supported audio container)
+# 3 = malformed file           (corrupt or structurally invalid audio)
+# 4 = unsupported encoding     (codec not supported by this libsndfile build)
+_BAD_SF_CODES = {1, 3, 4}

-    PyAV wraps FFmpeg's C libraries in-process — no subprocess is
-    spawned, which is critical to avoid crashing CUDA-active vLLM
-    worker processes.

-    The returned waveform is at the native sample rate of the video's
-    audio stream.  Resampling to a model-specific rate is left to the
-    downstream :class:`AudioResampler` in the parsing pipeline.
+def load_audio_pyav(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+) -> tuple[npt.NDArray, float]:
+    """Load an audio file using PyAV (FFmpeg), returning float32 mono waveform.
+
+    Decodes the audio stream at its native sample rate. Channel reduction to
+    mono is performed by averaging across channels.  Resampling to a
+    model-specific rate is left to the downstream :class:`AudioResampler`.

    Args:
-        data: Raw video file bytes (e.g. from an mp4 file).
+        path: A :class:`~io.BytesIO` buffer, a filesystem
+            :class:`~pathlib.Path`, or a string path.

    Returns:
-        A tuple of ``(waveform, sample_rate)`` suitable for use as an
-        :class:`AudioItem`.
+        ``(waveform, sample_rate)`` where *waveform* is a 1-D float32
+        NumPy array and *sample_rate* is the native sample rate in Hz.
    """
-    if data is None or len(data) == 0:
-        raise ValueError(
-            "Cannot extract audio: video bytes are missing or empty. "
-            "Ensure video was loaded with keep_video_bytes=True for "
-            "audio-in-video extraction."
-        )
+    native_sr = None
    try:
-        with av.open(BytesIO(data)) as container:
+        with av.open(path) as container:
            if not container.streams.audio:
-                raise ValueError("No audio stream found in the video.")
+                raise ValueError("No audio stream found.")
            stream = container.streams.audio[0]
+            stream.thread_type = "AUTO"
            native_sr = stream.rate
+            sr = sr or native_sr

            chunks: list[npt.NDArray] = []
-            for frame in container.decode(audio=0):
-                arr = frame.to_ndarray()
-                chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
+            needs_resampling = not math.isclose(
+                float(sr),
+                float(native_sr),
+                rel_tol=0.0,
+                abs_tol=1e-6,
+            )
+            resampler = (
+                av.AudioResampler(format="fltp", layout="mono", rate=sr)
+                if needs_resampling
+                else None
+            )
+            for frame in container.decode(stream):
+                if needs_resampling:
+                    assert resampler is not None
+                    for out_frame in resampler.resample(frame):
+                        chunks.append(out_frame.to_ndarray())
+                else:
+                    chunks.append(frame.to_ndarray())
    except ValueError:
        raise
    except Exception as e:
@@ -78,37 +100,54 @@ def extract_audio_from_video_bytes(
    if not chunks:
        raise ValueError("No audio found in the video.")

-    audio = np.concatenate(chunks).astype(np.float32)
-    return audio, float(native_sr)
+    audio = np.concatenate(chunks, axis=-1).astype(np.float32)
+    if mono and audio.ndim > 1:
+        audio = np.mean(audio, axis=0)

+    return audio, sr

-def is_video(data: bytes) -> bool:
-    """Check if the fetched bytes are video"""
-    if len(data) < 12:
-        return False

-    box_type = data[4:8]
-    major_brand = data[8:12]
+def load_audio_soundfile(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+) -> tuple[np.ndarray, int]:
+    """Load audio via soundfile"""
+    with soundfile.SoundFile(path) as f:
+        native_sr = f.samplerate
+        y = f.read(dtype="float32", always_2d=False).T

-    MP4_BRANDS = {
-        b"mp41",
-        b"mp42",  # MP4
-        b"isom",  # ISO Base Media
-        b"iso2",
-        b"iso4",
-        b"iso5",
-        b"iso6",
-        b"M4V ",
-        b"M4A ",  # Apple
-        b"avc1",  # H.264
-        b"dash",  # DASH
-        b"mmp4",
-        b"MSNV",
-    }
+    if mono and y.ndim > 1:
+        y = np.mean(y, axis=tuple(range(y.ndim - 1)))

-    is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
-    is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
-    return is_mp4 or is_avi
+    if sr is not None and sr != native_sr:
+        y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
+        return y, int(sr)
+    return y, native_sr
+
+
+def load_audio(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+):
+    try:
+        return load_audio_soundfile(path, sr=sr, mono=mono)
+    except soundfile.LibsndfileError as exc:
+        # Only fall back for known format-detection failures.
+        # Re-raise anything else (e.g. corrupt but recognised format).
+        if exc.code not in _BAD_SF_CODES:
+            raise
+        # soundfile may have advanced the BytesIO seek position before failing;
+        # reset it so PyAV can read from the beginning.
+        if isinstance(path, BytesIO):
+            path.seek(0)
+        try:
+            return load_audio_pyav(path, sr=sr, mono=mono)
+        except Exception as pyav_exc:
+            raise ValueError("Invalid or unsupported audio file.") from pyav_exc


 class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
@@ -129,19 +168,17 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
-        if is_video(data):
-            return extract_audio_from_video_bytes(data)
-        return librosa.load(BytesIO(data), sr=None)
+        return load_audio(BytesIO(data), sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
-        return self.load_bytes(base64.b64decode(data))
+        return self.load_bytes(pybase64.b64decode(data))

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
-        return librosa.load(filepath, sr=None)
+        return load_audio(filepath, sr=None)

    def encode_base64(
        self,
@@ -155,7 +192,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
            soundfile.write(buffer, audio, sr, format=audio_format)
            data = buffer.getvalue()

-        return base64.b64encode(data).decode("utf-8")
+        return pybase64.b64encode(data).decode("utf-8")


 class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):

--- a/vllm/multimodal/media/video.py
+++ b/vllm/multimodal/media/video.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import base64
 from functools import partial
 from pathlib import Path
 from typing import Any

 import numpy as np
 import numpy.typing as npt
+import pybase64
 from PIL import Image

 from vllm import envs
@@ -80,11 +80,23 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
                "image/jpeg",
            )

-            return np.stack(
+            frames = np.stack(
                [np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
-            ), {}
-
-        return self.load_bytes(base64.b64decode(data))
+            )
+            total = int(frames.shape[0])
+            fps = float(self.kwargs.get("fps", 1))
+            duration = total / fps if fps > 0 else 0.0
+            metadata = {
+                "total_num_frames": total,
+                "fps": fps,
+                "duration": duration,
+                "video_backend": "jpeg_sequence",
+                "frames_indices": list(range(total)),
+                "do_sample_frames": False,
+            }
+            return frames, metadata
+
+        return self.load_bytes(pybase64.b64decode(data))

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        with filepath.open("rb") as f:

--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -497,7 +497,7 @@ class MultiModalDataParser:
        *,
        target_sr: float | None = None,
        target_channels: int | None = None,
-        audio_resample_method: Literal["librosa", "scipy"] = "librosa",
+        audio_resample_method: Literal["pyav", "scipy"] = "pyav",
        video_needs_metadata: bool = False,
        expected_hidden_size: int | None = None,
    ) -> None:

--- a/vllm/multimodal/processing/processor.py
+++ b/vllm/multimodal/processing/processor.py
@@ -1682,6 +1682,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):


 class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
+    skip_decoder_start_token: bool = False
+
    @abstractmethod
    def create_encoder_prompt(
        self,

--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import contextlib
 import json
 from abc import abstractmethod
 from collections.abc import Sequence
@@ -18,7 +19,7 @@ from openai.types.responses.response_output_text import Logprob
 from openai.types.responses.response_reasoning_item import (
    Content as ResponseReasoningTextContent,
 )
-from pydantic import TypeAdapter
+from pydantic import TypeAdapter, ValidationError

 from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.chat_completion.protocol import (
@@ -154,7 +155,9 @@ class Parser:
    @abstractmethod
    def extract_response_outputs(
        self,
+        *,
        model_output: str,
+        model_output_token_ids: Sequence[int],
        request: ResponsesRequest,
        enable_auto_tools: bool = False,
        tool_call_id_type: str = "random",
@@ -169,6 +172,7 @@ class Parser:

        Args:
            model_output: The complete model-generated string.
+            model_output_token_ids: The token IDs of the model output.
            request: The request object used to generate the output.
            enable_auto_tools: Whether to enable automatic tool call parsing.
            tool_call_id_type: Type of tool call ID generation ("random", etc).
@@ -195,7 +199,7 @@ class Parser:
            request: The request object used to generate the output.

        Returns:
-            A tuple of (reasoning_content, response_content).
+            A tuple of (reasoning, response_content).
        """

    @abstractmethod
@@ -312,7 +316,9 @@ class DelegatingParser(Parser):

    def extract_response_outputs(
        self,
+        *,
        model_output: str,
+        model_output_token_ids: Sequence[int],
        request: ResponsesRequest,
        enable_auto_tools: bool = False,
        tool_call_id_type: str = "random",
@@ -422,15 +428,19 @@ class DelegatingParser(Parser):

        if request.tool_choice == "required":
            # Required tool calls - parse JSON
-            assert content is not None
-            tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
-            function_calls.extend(
-                FunctionCall(
-                    name=tool_call.name,
-                    arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
+            tool_calls = []
+            with contextlib.suppress(ValidationError):
+                content = content or ""
+                tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(
+                    content
+                )
+            for tool_call in tool_calls:
+                function_calls.append(
+                    FunctionCall(
+                        name=tool_call.name,
+                        arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
+                    )
                )
-                for tool_call in tool_calls
-            )
            return function_calls, None  # Clear content since tool is called.

        if (

--- a/vllm/parser/parser_manager.py
+++ b/vllm/parser/parser_manager.py
@@ -199,7 +199,7 @@ class ParserManager:
        parser: type[ToolParser] | None = None
        if not enable_auto_tools or tool_parser_name is None:
            return parser
-        logger.info('"auto" tool choice has been enabled.')
+        logger.info_once('"auto" tool choice has been enabled.')

        try:
            if (

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -281,6 +281,9 @@ class CpuPlatform(Platform):
        # Disable multi-stream for shared experts as no Stream on CPU
        os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"

+        # Prevent inductor from generating num_thread(), which breaks thread binding
+        os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
+
        # Intel OpenMP setting
        ld_preload_str = os.getenv("LD_PRELOAD", "")
        if "libiomp5.so" in ld_preload_str:

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -4,6 +4,8 @@
 pynvml. However, it should not initialize cuda context.
 """

+from __future__ import annotations
+
 import os
 from collections.abc import Callable
 from datetime import timedelta
@@ -17,6 +19,7 @@ from typing_extensions import ParamSpec

 # import custom ops, trigger op registration
 import vllm._C  # noqa
+import vllm._C_stable_libtorch  # noqa
 from vllm.logger import init_logger
 from vllm.utils.import_utils import import_pynvml
 from vllm.utils.torch_utils import cuda_device_count_stateless
@@ -49,21 +52,34 @@ def _get_backend_priorities(
    use_mla: bool,
    device_capability: DeviceCapability,
    num_heads: int | None = None,
+    kv_cache_dtype: CacheDType | None = None,
 ) -> list[AttentionBackendEnum]:
    """Get backend priorities with lazy import to avoid circular dependency."""
    if use_mla:
        if device_capability.major == 10:
-            # Prefer FlashInfer at low head counts (FlashMLA uses padding)
-            if num_heads is not None and num_heads <= 16:
+            # Sparse MLA backend priorities
+            # See https://github.com/vllm-project/vllm/issues/35807 for
+            # benchmark results
+            if kv_cache_dtype is not None and kv_cache_dtype.startswith("fp8"):
+                # Prefer FlashInfer for fp8 kv cache
                sparse_backends = [
                    AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
                    AttentionBackendEnum.FLASHMLA_SPARSE,
                ]
            else:
-                sparse_backends = [
-                    AttentionBackendEnum.FLASHMLA_SPARSE,
-                    AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
-                ]
+                # BF16 KV Cache
+                # Prefer FlashInfer at low head counts (FlashMLA uses padding)
+                if num_heads is not None and num_heads <= 16:
+                    sparse_backends = [
+                        AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
+                        AttentionBackendEnum.FLASHMLA_SPARSE,
+                    ]
+                else:
+                    sparse_backends = [
+                        AttentionBackendEnum.FLASHMLA_SPARSE,
+                        AttentionBackendEnum.FLASHINFER_MLA_SPARSE,
+                    ]
+
            return [
                AttentionBackendEnum.FLASHINFER_MLA,
                AttentionBackendEnum.CUTLASS_MLA,
@@ -165,7 +181,7 @@ class CudaPlatformBase(Platform):
        pass

    @classmethod
-    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        parallel_config = vllm_config.parallel_config
        model_config = vllm_config.model_config

@@ -198,11 +214,11 @@ class CudaPlatformBase(Platform):
    def get_valid_backends(
        cls,
        device_capability: DeviceCapability,
-        attn_selector_config: "AttentionSelectorConfig",
+        attn_selector_config: AttentionSelectorConfig,
        num_heads: int | None = None,
    ) -> tuple[
-        list[tuple["AttentionBackendEnum", int]],
-        dict["AttentionBackendEnum", tuple[int, list[str]]],
+        list[tuple[AttentionBackendEnum, int]],
+        dict[AttentionBackendEnum, tuple[int, list[str]]],
    ]:
        valid_backends_priorities = []
        invalid_reasons: dict[AttentionBackendEnum, tuple[int, list[str]]] = {}
@@ -211,6 +227,7 @@ class CudaPlatformBase(Platform):
            attn_selector_config.use_mla,
            device_capability,
            num_heads,
+            attn_selector_config.kv_cache_dtype,
        )
        for priority, backend in enumerate(backend_priorities):
            try:
@@ -231,8 +248,8 @@ class CudaPlatformBase(Platform):
    @classmethod
    def get_attn_backend_cls(
        cls,
-        selected_backend: "AttentionBackendEnum | None",
-        attn_selector_config: "AttentionSelectorConfig",
+        selected_backend: AttentionBackendEnum | None,
+        attn_selector_config: AttentionSelectorConfig,
        num_heads: int | None = None,
    ) -> str:
        device_capability = cls.get_device_capability()
@@ -324,7 +341,7 @@ class CudaPlatformBase(Platform):
        return selected_backend.get_path()

    @classmethod
-    def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
+    def get_supported_vit_attn_backends(cls) -> list[AttentionBackendEnum]:
        if cls.has_device_capability(80):
            return [
                AttentionBackendEnum.FLASH_ATTN,
@@ -345,8 +362,8 @@ class CudaPlatformBase(Platform):
        cls,
        head_size: int,
        dtype: torch.dtype,
-        backend: "AttentionBackendEnum | None" = None,
-    ) -> "AttentionBackendEnum":
+        backend: AttentionBackendEnum | None = None,
+    ) -> AttentionBackendEnum:
        if backend is not None:
            assert backend in cls.get_supported_vit_attn_backends(), (
                f"Backend {backend} is not supported for vit attention. "
@@ -371,7 +388,8 @@ class CudaPlatformBase(Platform):
                    )
                if is_backend_supported:
                    logger.info_once(
-                        f"Using backend {vit_attn_backend} for vit attention"
+                        f"Using backend {vit_attn_backend} for vit attention",
+                        scope="local",
                    )
                    return vit_attn_backend
            except ImportError:
@@ -493,6 +511,11 @@ class CudaPlatformBase(Platform):
    def support_static_graph_mode(cls) -> bool:
        return True

+    @classmethod
+    def support_deep_gemm(cls) -> bool:
+        """Currently, only Hopper and Blackwell GPUs are supported."""
+        return cls.is_device_capability(90) or cls.is_device_capability_family(100)
+
    @classmethod
    def num_compute_units(cls, device_id: int = 0) -> int:
        return torch.cuda.get_device_properties(device_id).multi_processor_count

--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -712,6 +712,13 @@ class Platform:
        """
        return False

+    @classmethod
+    def support_deep_gemm(cls) -> bool:
+        """
+        Returns if DeepGEMM is supported by the current platform.
+        """
+        return False
+
    @classmethod
    def use_custom_op_collectives(cls) -> bool:
        """

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -28,6 +28,7 @@ try:
    from amdsmi import (
        AmdSmiException,
        amdsmi_get_gpu_asic_info,
+        amdsmi_get_gpu_device_uuid,
        amdsmi_get_processor_handles,
        amdsmi_init,
        amdsmi_shut_down,
@@ -439,8 +440,6 @@ class RocmPlatform(Platform):
        device_capability = cls.get_device_capability()
        assert device_capability is not None

-        attn_selector_config = attn_selector_config._replace(block_size=None)
-
        # First try checking just the selected backend, if there is one.
        if selected_backend is not None:
            try:
@@ -611,6 +610,21 @@ class RocmPlatform(Platform):
            return _ROCM_DEVICE_ID_NAME_MAP[device_name]
        return asic_info["market_name"]

+    @classmethod
+    @with_amdsmi_context
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Return the amdsmi UUID of GPU `device_id`, or "" if a query fails."""
+        try:
+            device = amdsmi_get_processor_handles()[device_id]
+        except AmdSmiException as error:
+            logger.error("amdsmi device query failed ", exc_info=error)
+            return ""
+        try:
+            return amdsmi_get_gpu_device_uuid(device)
+        except AmdSmiException as error:
+            logger.error("amdsmi device uuid query failed ", exc_info=error)
+            # Same fallback as the handle lookup; avoids an unbound local.
+            return ""
+
    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        device_props = torch.cuda.get_device_properties(device_id)
@@ -668,7 +681,6 @@ class RocmPlatform(Platform):
    def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
        from vllm.config.compilation import CUDAGraphMode

-        cache_config = vllm_config.cache_config
        compilation_config = vllm_config.compilation_config
        parallel_config = vllm_config.parallel_config

@@ -690,32 +702,9 @@ class RocmPlatform(Platform):
                )
                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

-        if cache_config and not cache_config.user_specified_block_size:
-            if (
-                envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
-                # NOTE: This block has been deprecated
-                # or get_env_variable_attn_backend()
-                # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
-                # TODO: monitor https://github.com/vllm-project/vllm/pull/30396
-                # to see how we can transition to the new way of selecting
-                # attention backends
-            ):
-                cache_config.block_size = 64
-                logger.warning(
-                    "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
-                )
-            else:
-                cache_config.block_size = 16
-
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"

-    @classmethod
-    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
-        # TODO: ROCm still sets block_size in check_and_update_config.
-        # Move that logic here so block_size is chosen by the backend.
-        pass
-
    @classmethod
    def verify_model_arch(cls, model_arch: str) -> None:
        if model_arch in _ROCM_UNSUPPORTED_MODELS: