Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
13 changed files
--- a/vllm/transformers_utils/configs/qwen3_5_moe.py
+++ b/vllm/transformers_utils/configs/qwen3_5_moe.py
@@ -16,7 +16,7 @@
 # limitations under the License.
 """Qwen3.5-MoE model configuration"""

-from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.configuration_utils import PretrainedConfig


 class Qwen3_5MoeTextConfig(PretrainedConfig):
@@ -75,10 +75,6 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
        eos_token_id=None,
        **kwargs,
    ):
-        kwargs["ignore_keys_at_rope_validation"] = [
-            "mrope_section",
-            "mrope_interleaved",
-        ]
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
@@ -104,7 +100,18 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
-        layer_type_validation(self.layer_types, self.num_hidden_layers)
+        if hasattr(self, "validate_layer_type"):
+            # Transformers v5
+            kwargs["ignore_keys_at_rope_validation"] = {
+                "mrope_section",
+                "mrope_interleaved",
+            }
+            self.validate_layer_type()
+        else:
+            # Transformers v4
+            from transformers.configuration_utils import layer_type_validation
+
+            layer_type_validation(self.layer_types, self.num_hidden_layers)

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim

--- a/vllm/transformers_utils/configs/qwen3_asr.py
+++ b/vllm/transformers_utils/configs/qwen3_asr.py
@@ -408,7 +408,6 @@ class Qwen3ASRConfig(PretrainedConfig):
        support_languages=None,
        **kwargs,
    ):
-        super().__init__(**kwargs)
        if thinker_config is None:
            thinker_config = {}
            logger.info(
@@ -417,6 +416,7 @@ class Qwen3ASRConfig(PretrainedConfig):

        self.thinker_config = Qwen3ASRThinkerConfig(**thinker_config)
        self.support_languages = support_languages
+        super().__init__(**kwargs)

    def get_text_config(self, decoder=False) -> "PretrainedConfig":
        """

--- a/vllm/transformers_utils/configs/qwen3_next.py
+++ b/vllm/transformers_utils/configs/qwen3_next.py
@@ -16,7 +16,7 @@
 # limitations under the License.
 """Qwen3-Next model configuration"""

-from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging

 logger = logging.get_logger(__name__)
@@ -253,7 +253,14 @@ class Qwen3NextConfig(PretrainedConfig):
                "linear_attention" if bool((i + 1) % 4) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
-        layer_type_validation(self.layer_types)
+        if hasattr(self, "validate_layer_type"):
+            # Transformers v5
+            self.validate_layer_type()
+        else:
+            # Transformers v4
+            from transformers.configuration_utils import layer_type_validation
+
+            layer_type_validation(self.layer_types)

        # linear attention part
        self.linear_conv_kernel_dim = linear_conv_kernel_dim

--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -47,6 +47,14 @@ class RadioConfig(PretrainedConfig):
        teachers: A list of teacher model configurations. Each teacher configuration is
            a dict with keys like "name" and some may have "use_summary".
        cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
+        video_temporal_patch_size: Number of consecutive video frames grouped into
+            a single tubelet for temporal compression. Default 1 (no compression).
+            When > 1, a dedicated video_embedder (3*T*P*P -> hidden) is created
+            alongside the image embedder (3*P*P -> hidden).
+        separate_video_embedder: When True and video_temporal_patch_size > 1, use a
+            dedicated video patch embedder (3*T*P*P -> hidden) separate from the
+            image embedder (3*P*P -> hidden). When False, a single embedder with
+            input size 3*T*P*P is used for both (images are duplicated T times).
    """

    model_type = "radio"
@@ -68,6 +76,8 @@ class RadioConfig(PretrainedConfig):
        register_multiple: int | None = None,
        teachers: list[dict[str, Any]] | None = None,
        cls_token_per_teacher: bool = False,
+        video_temporal_patch_size: int = 1,
+        separate_video_embedder: bool = True,
        **kwargs,
    ):
        self.model_name = model_name
@@ -95,4 +105,6 @@ class RadioConfig(PretrainedConfig):
        self.register_multiple = register_multiple
        self.teachers = teachers if teachers is not None else []
        self.cls_token_per_teacher = cls_token_per_teacher
+        self.video_temporal_patch_size = video_temporal_patch_size
+        self.separate_video_embedder = separate_video_embedder
        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/speculators/__init__.py
+++ b/vllm/transformers_utils/configs/speculators/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from .base import SpeculatorsConfig
+
+__all__ = ["SpeculatorsConfig"]
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
+from dataclasses import fields, is_dataclass
 from typing import Any

 from transformers import PretrainedConfig
@@ -8,15 +9,29 @@ from transformers import PretrainedConfig
 from vllm.transformers_utils.configs.speculators.algos import (
    SUPPORTED_SPECULATORS_TYPES,
 )
-
-__all__ = ["SpeculatorsConfig"]
-
 from vllm.transformers_utils.utils import without_trust_remote_code


 class SpeculatorsConfig(PretrainedConfig):
    model_type = "speculators"

+    def __init__(self, **kwargs):
+        # Transformers v4 - super().__init__ which sets all kwargs as attributes
+        if not is_dataclass(PretrainedConfig):
+            return super().__init__(**kwargs)
+        # Transformers v5 - super().__init__ performs some validation before
+        # setting all kwargs as attributes, so we set them first to be safe
+        pre_trained_config_fields = {f.name for f in fields(PretrainedConfig)}
+        super_kwargs = dict()
+        for key, value in kwargs.items():
+            if key == "model_type":
+                continue  # model_type is set as a class variable, so skip it here
+            elif key in pre_trained_config_fields:
+                super_kwargs[key] = value
+            else:
+                setattr(self, key, value)
+        super().__init__(**super_kwargs)
+
    @classmethod
    def from_pretrained(
        cls,

--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -43,7 +43,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
            use `False`, but v0.5 and above use `True`.
    """

-    wrapped_model_config: transformers.PretrainedConfig
    model_type = "ultravox"
    audio_token = "<|audio|>"
    is_composition = False
@@ -75,6 +74,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
        self.num_projector_layers = num_projector_layers

        # N.B. May set the wrapped_model_config below.
+        self.wrapped_model_config: transformers.PretrainedConfig
        self.text_model_id = text_model_id
        if text_model_id is None:
            text_config = text_config or {}

--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -228,7 +228,7 @@ class ModelArchConfigConvertorBase:
            "pangu_ultra_moe_mtp",
            "bailing_hybrid",
        ):
-            return self.hf_text_config.kv_lora_rank is not None
+            return getattr(self.hf_text_config, "kv_lora_rank", None) is not None
        elif self.hf_text_config.model_type == "eagle":
            # if the model is an EAGLE module, check for the
            # underlying architecture
@@ -241,7 +241,7 @@ class ModelArchConfigConvertorBase:
                    "deepseek_v32",
                    "deepseek_mtp",
                )
-                and self.hf_text_config.kv_lora_rank is not None
+                and getattr(self.hf_text_config, "kv_lora_rank", None) is not None
            )
        return False

@@ -300,6 +300,28 @@ class ModelArchConfigConvertorBase:
        return model_arch_config


+class CohereAsrModelArchConfigConvertor(ModelArchConfigConvertorBase):
+    def get_total_num_attention_heads(self) -> int:
+        return self.hf_text_config.transf_decoder["config_dict"]["num_attention_heads"]
+
+    def get_head_size(self) -> int:
+        hidden_size = self.hf_text_config.transf_decoder["config_dict"]["hidden_size"]
+        num_attention_heads = self.hf_text_config.transf_decoder["config_dict"][
+            "num_attention_heads"
+        ]
+        return hidden_size // num_attention_heads
+
+    def get_total_num_kv_heads(self) -> int:
+        enc_num_kv_heads = self.hf_text_config.encoder["n_heads"]
+        dec_num_kv_heads = self.hf_text_config.transf_decoder["config_dict"][
+            "num_attention_heads"
+        ]
+        assert enc_num_kv_heads == dec_num_kv_heads, (
+            "Encoder and decoder must have the same number of kv heads"
+        )
+        return enc_num_kv_heads
+
+
 class MambaModelArchConfigConvertor(ModelArchConfigConvertorBase):
    def get_head_size(self) -> int:
        return 0
@@ -425,6 +447,7 @@ class LongCatFlashMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):

 # hf_config.model_type -> convertor class
 MODEL_ARCH_CONFIG_CONVERTORS = {
+    "cohere_asr": CohereAsrModelArchConfigConvertor,
    "mamba": MambaModelArchConfigConvertor,
    "falcon_mamba": MambaModelArchConfigConvertor,
    "timm_wrapper": TerratorchModelArchConfigConvertor,

--- a/vllm/transformers_utils/processors/__init__.py
+++ b/vllm/transformers_utils/processors/__init__.py
@@ -12,36 +12,56 @@ import importlib

 __all__ = [
    "BagelProcessor",
+    "CohereASRProcessor",
    "DeepseekVLV2Processor",
    "FireRedASR2Processor",
    "FunASRProcessor",
    "GLM4VProcessor",
+    "H2OVLProcessor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
+    "InternVLProcessor",
+    "IsaacProcessor",
    "KimiAudioProcessor",
+    "KimiK25Processor",
    "MistralCommonPixtralProcessor",
    "MistralCommonVoxtralProcessor",
+    "NanoNemotronVLProcessor",
+    "NemotronVLProcessor",
+    "LlamaNemotronVLEmbedProcessor",
+    "NVLMProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
+    "Step3VLProcessor",
 ]

 _CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
+    "CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
    "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
    "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
+    "H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
    "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
    "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
+    "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
+    "IsaacProcessor": "vllm.transformers_utils.processors.isaac",
    "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
+    "KimiK25Processor": "vllm.transformers_utils.processors.kimi_k25",
    "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
    "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
+    "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
+    "NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
+    "LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
+    "NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
    "OvisProcessor": "vllm.transformers_utils.processors.ovis",
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
+    "Step3VLProcessor": "vllm.transformers_utils.processors.step3_vl",
 }



--- a/vllm/transformers_utils/processors/cohere_asr.py
+++ b/vllm/transformers_utils/processors/cohere_asr.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+import math
+import random
+
+import librosa
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import AutoFeatureExtractor, AutoProcessor, BatchFeature
+from transformers.feature_extraction_sequence_utils import (
+    SequenceFeatureExtractor,
+)
+from transformers.processing_utils import ProcessorMixin
+
+logger = logging.getLogger(__name__)
+
+CONSTANT = 1e-5
+INF_VAL = 10000.0
+
+
+class FilterbankFeatures(nn.Module):
+    """Featurizer that converts wavs to Mel Spectrograms.
+    See AudioToMelSpectrogramPreprocessor for args.
+    """
+
+    window: torch.Tensor
+    fb: torch.Tensor
+
+    def __init__(
+        self,
+        sample_rate=16000,
+        n_window_size=320,
+        n_window_stride=160,
+        window="hann",
+        normalize="per_feature",
+        n_fft=None,
+        preemph=0.97,
+        nfilt=64,
+        lowfreq=0,
+        highfreq=None,
+        log=True,
+        log_zero_guard_type="add",
+        log_zero_guard_value=2**-24,
+        dither=CONSTANT,
+        pad_to=16,
+        max_duration=30,
+        frame_splicing=1,
+        exact_pad=False,
+        pad_value=0,
+        mag_power=2.0,
+        use_grads=False,
+        rng=None,
+        nb_augmentation_prob=0.0,
+        nb_max_freq=4000,
+        mel_norm="slaney",
+        stft_exact_pad=False,
+        stft_conv=False,
+        device="cpu",
+    ):
+        """Build the STFT window, mel filterbank and padding settings.
+
+        Only the arguments whose handling is not obvious are listed.
+
+        Args:
+            n_window_size: STFT window length in samples; must be a positive int.
+            n_window_stride: STFT hop length in samples; must be a positive int.
+            window: One of "hann", "hamming", "blackman", "bartlett". "none"
+                or an unknown name produces no window tensor and trips the
+                assertion at the end of this constructor.
+            n_fft: FFT size. When falsy, the smallest power of two that is
+                >= ``n_window_size`` is used.
+            highfreq: Upper mel band edge; falls back to ``sample_rate / 2``.
+            log_zero_guard_type: "add" or "clamp".
+            log_zero_guard_value: A number, or "tiny" / "eps" (resolved per
+                dtype in ``log_zero_guard_value_fn``).
+            pad_to: Positive number (pad frames to a multiple of it), 0 (no
+                padding) or the string "max" (pad to ``self.max_length``).
+            exact_pad: When True the signal is padded manually in ``forward``
+                and ``torch.stft`` runs with ``center=False``.
+            use_grads: When False, ``forward`` is wrapped in ``torch.no_grad``.
+            stft_exact_pad, stft_conv: Ignored; a warning is logged if set.
+            device: Device of the ``torch.Generator`` used for dithering.
+
+        Raises:
+            NotImplementedError: ``exact_pad`` is True and ``n_window_stride``
+                is odd.
+            ValueError: Invalid window size/stride or ``log_zero_guard_type``.
+        """
+        super().__init__()
+        if stft_conv or stft_exact_pad:
+            logger.warning(
+                "Using torch_stft is deprecated and has been removed. "
+                "The values have been forcibly set to False for "
+                "FilterbankFeatures and AudioToMelSpectrogramPreprocessor. "
+                "Please set exact_pad to True as needed."
+            )
+        if exact_pad and n_window_stride % 2 == 1:
+            raise NotImplementedError(
+                f"{self} received exact_pad == True, but hop_size was odd. "
+                "If audio_length % hop_size == 0, the returned spectrogram "
+                "would not be of length audio_length // hop_size. "
+                "Please use an even hop_size."
+            )
+        self.log_zero_guard_value = log_zero_guard_value
+        if (
+            n_window_size is None
+            or n_window_stride is None
+            or not isinstance(n_window_size, int)
+            or not isinstance(n_window_stride, int)
+            or n_window_size <= 0
+            or n_window_stride <= 0
+        ):
+            raise ValueError(
+                f"{self} got an invalid value for either n_window_size or "
+                f"n_window_stride. Both must be positive ints."
+            )
+
+        self.sample_rate = sample_rate
+        self.win_length = n_window_size
+        self.hop_length = n_window_stride
+        # Resolve the FFT size once; everything below must use self.n_fft,
+        # never the raw (possibly None) argument.
+        self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
+        self.stft_pad_amount = (
+            (self.n_fft - self.hop_length) // 2 if exact_pad else None
+        )
+        self.exact_pad = exact_pad
+        self.max_duration = max_duration
+
+        if exact_pad:
+            logger.info("STFT using exact pad")
+        torch_windows = {
+            "hann": torch.hann_window,
+            "hamming": torch.hamming_window,
+            "blackman": torch.blackman_window,
+            "bartlett": torch.bartlett_window,
+            "none": None,
+        }
+        window_fn = torch_windows.get(window)
+        window_tensor = (
+            window_fn(self.win_length, periodic=False) if window_fn else None
+        )
+        self.register_buffer("window", window_tensor)
+
+        self.normalize = normalize
+        self.log = log
+        self.dither = dither
+        self.frame_splicing = frame_splicing
+        self.nfilt = nfilt
+        self.preemph = preemph
+        self.pad_to = pad_to
+        highfreq = highfreq or sample_rate / 2
+        # Minimum-duration padding is disabled (upstream value was 1.0), so
+        # the corresponding branch in forward() is inactive.
+        self.pad_min_duration = 0.0
+        self.pad_direction = "both"
+
+        filterbanks = torch.tensor(
+            librosa.filters.mel(
+                sr=sample_rate,
+                n_fft=self.n_fft,
+                n_mels=nfilt,
+                fmin=lowfreq,
+                fmax=highfreq,
+                norm=mel_norm,
+            ),
+            dtype=torch.float,
+        ).unsqueeze(0)
+        self.register_buffer("fb", filterbanks)
+
+        # Calculate maximum sequence length
+        max_length = self.get_seq_len(
+            torch.tensor(max_duration * sample_rate, dtype=torch.float)
+        )
+        # pad_to may be the string "max" (handled in forward()); comparing a
+        # str with 0 would raise TypeError, so only numeric values are compared.
+        if not isinstance(pad_to, str) and pad_to > 0:
+            max_pad = pad_to - (max_length % pad_to)
+        else:
+            max_pad = 0
+        self.max_length = max_length + max_pad
+        self.pad_value = pad_value
+        self.mag_power = mag_power
+
+        # We want to avoid taking the log of zero
+        # There are two options: either adding or clamping to a small value
+        if log_zero_guard_type not in ["add", "clamp"]:
+            raise ValueError(
+                f"{self} received {log_zero_guard_type} for the "
+                f"log_zero_guard_type parameter. It must be either 'add' or "
+                f"'clamp'."
+            )
+
+        self.use_grads = use_grads
+        if not use_grads:
+            self.forward = torch.no_grad()(self.forward)
+        self._rng = random.Random() if rng is None else rng
+        self.nb_augmentation_prob = nb_augmentation_prob
+        if self.nb_augmentation_prob > 0.0:
+            if nb_max_freq >= sample_rate / 2:
+                self.nb_augmentation_prob = 0.0
+            else:
+                # Use the resolved FFT size: the n_fft argument defaults to
+                # None, which would make this multiplication raise TypeError.
+                self._nb_max_fft_bin = int((nb_max_freq / sample_rate) * self.n_fft)
+
+        # log_zero_guard_value is the small value used to guard the log; it
+        # may be an actual number, or "tiny", or "eps"
+        self.log_zero_guard_type = log_zero_guard_type
+
+        assert self.window is not None
+        assert self.fb is not None
+        self.window = self.window.to(dtype=torch.bfloat16)
+        self.fb = self.fb.to(dtype=torch.bfloat16)
+
+        self.generator = torch.Generator(device=device)
+        self.generator.manual_seed(0)
+
+    @torch._dynamo.disable
+    def stft(self, x):
+        """Return the complex STFT of ``x`` computed with autocast disabled.
+
+        The stored window is converted to float32 on ``x``'s device for the
+        call. ``center`` is the inverse of ``self.exact_pad``.
+        """
+        # disable autocast to get full range of stft values
+        with torch.amp.autocast(x.device.type, enabled=False):
+            fp32_window = self.window.to(dtype=torch.float, device=x.device)
+            spectrum = torch.stft(
+                x,
+                n_fft=self.n_fft,
+                hop_length=self.hop_length,
+                win_length=self.win_length,
+                center=not self.exact_pad,
+                window=fp32_window,
+                return_complex=True,
+                pad_mode="constant",
+            )
+            return spectrum
+
+    def log_zero_guard_value_fn(self, x):
+        """Resolve ``self.log_zero_guard_value`` to a number for ``x``'s dtype.
+
+        Returns:
+            ``torch.finfo(x.dtype).tiny`` for "tiny", ``torch.finfo(x.dtype).eps``
+            for "eps", otherwise the stored (non-string) value unchanged.
+
+        Raises:
+            ValueError: The stored value is a string other than "tiny"/"eps".
+        """
+        guard = self.log_zero_guard_value
+        if not isinstance(guard, str):
+            return guard
+        if guard == "tiny":
+            return torch.finfo(x.dtype).tiny
+        if guard == "eps":
+            return torch.finfo(x.dtype).eps
+        # The offending parameter is log_zero_guard_value (not *_type).
+        raise ValueError(
+            f"{self} received {guard} for the "
+            f"log_zero_guard_value parameter. It must be either a "
+            f"number, 'tiny', or 'eps'"
+        )
+
+    def get_seq_len(self, seq_len):
+        """Convert sample counts into STFT frame counts (``torch.long``).
+
+        ``stft_pad_amount`` is None when ``exact_pad`` is off; in that case the
+        STFT runs centered and the total padding is ``n_fft // 2`` per side.
+        Otherwise the explicit per-side pad amount is used.
+        """
+        if self.stft_pad_amount is None:
+            total_pad = self.n_fft // 2 * 2
+        else:
+            total_pad = self.stft_pad_amount * 2
+        usable = seq_len + total_pad - self.n_fft
+        frames = torch.floor_divide(usable, self.hop_length)
+        return frames.to(dtype=torch.long)
+
+    @property
+    def filter_banks(self):
+        """Return the ``fb`` buffer (the mel filterbank registered in __init__)."""
+        return self.fb
+
+    def splice_frames(self, x, frame_splicing):
+        """Stack ``frame_splicing`` copies of the frames along the feature dim.
+
+        input is batch_size, feature_dim, num_frames
+        output is batch_size, feature_dim*frame_splicing, num_frames
+
+        NOTE(review): each extra entry re-joins ``x[:, :, :n]`` and
+        ``x[:, :, n:]`` along time, which reproduces ``x`` without any shift;
+        confirm whether a time offset was intended.
+        """
+        pieces = [x] + [
+            torch.cat([x[:, :, :offset], x[:, :, offset:]], dim=2)
+            for offset in range(1, frame_splicing)
+        ]
+        return torch.cat(pieces, dim=1)
+
+    def normalize_batch(self, x, seq_len, normalize_type):
+        """Normalize ``x`` and return ``(normalized, mean, std)``.
+
+        ``x`` is indexed as (batch, feature, time) and ``seq_len`` holds the
+        number of valid time steps per batch item.
+
+        Modes, selected by ``normalize_type``:
+          * "per_feature": mean/std per (batch, feature) over the valid time
+            steps only; std uses an ``n - 1`` denominator, NaNs are replaced
+            by 0 and ``CONSTANT`` is added before dividing.
+          * "all_features": one scalar mean/std per batch item over its valid
+            time steps; ``CONSTANT`` is added to std.
+          * a container with "fixed_mean" and "fixed_std" entries: those
+            values are used as the statistics.
+          * anything else: ``x`` is returned unchanged with ``None`` stats.
+        """
+        x_mean = None
+        x_std = None
+        if normalize_type == "per_feature":
+            batch_size = x.shape[0]
+            max_time = x.shape[2]
+
+            # When doing stream capture to a graph, item() is not allowed
+            # because it calls cudaStreamSynchronize(). Therefore, we are
+            # sacrificing some error checking when running with cuda graphs.
+            # if (
+            #     torch.cuda.is_available()
+            #     and not torch.cuda.is_current_stream_capturing()
+            #     and torch.any(seq_len == 1).item()
+            # ):
+            #     raise ValueError(
+            #         "normalize_batch with `per_feature` normalize_type "
+            #         "received a tensor of length 1. This will result in "
+            #         "torch.std() returning nan. Make sure your audio length "
+            #         "has enough samples for a single feature (ex. at least "
+            #         "`hop_length` for Mel Spectrograms)."
+            #     )
+            # Boolean (batch, time) mask that is True for valid time steps.
+            time_steps = (
+                torch.arange(max_time, device=x.device)
+                .unsqueeze(0)
+                .expand(batch_size, max_time)
+            )
+            valid_mask = time_steps < seq_len.unsqueeze(1)
+            x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2)
+            x_mean_denominator = valid_mask.sum(axis=1)
+            x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1)
+
+            # Subtract 1 in the denominator to correct for the bias.
+            x_std = torch.sqrt(
+                torch.sum(
+                    torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0)
+                    ** 2,
+                    axis=2,
+                )
+                / (x_mean_denominator.unsqueeze(1) - 1.0)
+            )
+            x_std = x_std.masked_fill(
+                x_std.isnan(), 0.0
+            )  # edge case: only 1 frame in denominator
+            # make sure x_std is not zero
+            x_std += CONSTANT
+            return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std
+        elif normalize_type == "all_features":
+            x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
+            x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
+            # Per-item loop; .item() forces a host sync for every batch entry.
+            for i in range(x.shape[0]):
+                x_mean[i] = x[i, :, : seq_len[i].item()].mean()
+                x_std[i] = x[i, :, : seq_len[i].item()].std()
+            # make sure x_std is not zero
+            x_std += CONSTANT
+            return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std
+        elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type:
+            # NOTE(review): presumably normalize_type is a dict here; for a
+            # plain string these `in` checks are substring tests - confirm.
+            x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device)
+            x_std = torch.tensor(normalize_type["fixed_std"], device=x.device)
+            return (
+                (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2))
+                / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2),
+                x_mean,
+                x_std,
+            )
+        else:
+            return x, x_mean, x_std
+
+    @torch.compile
+    def forward(self, x, seq_len, linear_spec=False):
+        """Turn a batch of waveforms into (log-)mel features.
+
+        ``x`` is indexed as (batch, samples); ``seq_len`` holds the number of
+        valid samples per batch item.
+
+        Steps, in order: optional minimum-duration padding, optional exact
+        STFT padding, dither, pre-emphasis (masked to the valid samples), STFT
+        magnitude raised to ``mag_power``, mel projection, log with zero
+        guard, frame splicing, normalization, masking of frames beyond the
+        valid length and padding of the time axis according to ``pad_to``.
+
+        Returns:
+            ``(features, frame_lengths)``. When ``linear_spec`` is True the
+            spectrum is returned right after the ``mag_power`` step, before
+            the mel projection.
+        """
+        # pad_min_duration is set to 0.0 in __init__, so this branch is
+        # inactive unless that attribute is changed afterwards.
+        if x.shape[1] < self.sample_rate * self.pad_min_duration:
+            pad_amount = int(self.sample_rate * self.pad_min_duration) - x.shape[1]
+            if self.pad_direction == "right":
+                x = F.pad(x, (0, pad_amount), value=self.pad_value)
+            elif self.pad_direction == "left":
+                x = F.pad(x, (pad_amount, 0), value=self.pad_value)
+            elif self.pad_direction == "both":
+                left_pad = pad_amount // 2
+                right_pad = pad_amount - left_pad
+                x = F.pad(x, (left_pad, right_pad), value=self.pad_value)
+            else:
+                raise ValueError(
+                    f"{self} received an invalid pad_direction: {self.pad_direction}. "
+                    f"It must be one of 'left', 'right', or 'both'."
+                )
+            seq_len = torch.tensor([x.shape[1]], dtype=torch.float, device=x.device)
+
+        # Keep the sample-domain lengths for the pre-emphasis mask; seq_len is
+        # re-bound to frame counts below.
+        seq_len_time = seq_len
+        seq_len_unfixed = self.get_seq_len(seq_len)
+
+        # fix for seq_len = 0 for streaming; if size was 0, it is always padded
+        # to 1, and normalizer fails
+        seq_len = torch.where(
+            seq_len == 0, torch.zeros_like(seq_len_unfixed), seq_len_unfixed
+        )
+
+        if self.stft_pad_amount is not None:
+            x = torch.nn.functional.pad(
+                x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "constant"
+            ).squeeze(1)
+
+        # use dither for inference as well
+        # NOTE(review): `+=` writes into the tensor passed by the caller when
+        # no earlier step has re-bound x - confirm callers do not reuse it.
+        if self.dither > 0:
+            x += self.dither * torch.randn(
+                x.shape, dtype=x.dtype, device=x.device, generator=self.generator
+            )
+
+        # do preemphasis
+        if self.preemph is not None:
+            timemask = torch.arange(x.shape[1], device=x.device).unsqueeze(
+                0
+            ) < seq_len_time.unsqueeze(1)
+            x = torch.cat(
+                (x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1
+            )
+
+            x = x.masked_fill(~timemask, 0.0)
+
+        x = self.stft(x)
+
+        # torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude
+        # guard is needed for sqrt if grads are passed through
+        guard = 0 if not self.use_grads else CONSTANT
+        x = torch.view_as_real(x)
+        x = torch.sqrt(x.pow(2).sum(-1) + guard)
+
+        # get power spectrum
+        if self.mag_power != 1.0:
+            x = x.pow(self.mag_power)
+
+        # return plain spectrogram if required
+        if linear_spec:
+            return x, seq_len
+
+        # disable autocast, otherwise it might be automatically casted to fp16
+        # on fp16 compatible GPUs and get NaN values for input value of 65520
+        with torch.amp.autocast(x.device.type, enabled=False):
+            # dot with filterbank energies
+            x = torch.matmul(self.fb.to(x.dtype), x)
+
+        # log features if required
+        if self.log:
+            if self.log_zero_guard_type == "add":
+                x = torch.log(x + self.log_zero_guard_value_fn(x))
+            elif self.log_zero_guard_type == "clamp":
+                x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x)))
+            else:
+                raise ValueError("log_zero_guard_type was not understood")
+
+        # frame splicing if required
+        if self.frame_splicing > 1:
+            x = self.splice_frames(x, self.frame_splicing)
+
+        # normalize if required
+        if self.normalize:
+            x, _, _ = self.normalize_batch(x, seq_len, normalize_type=self.normalize)
+
+        # mask to zero any values beyond seq_len in batch, pad to multiple of
+        # `pad_to` (for efficiency)
+        max_len = x.size(-1)
+        mask = torch.arange(max_len, device=x.device)
+        mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1)
+        x = x.masked_fill(
+            mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value
+        )
+
+        del mask
+        pad_to = self.pad_to
+        if pad_to == "max":
+            x = nn.functional.pad(
+                x, (0, self.max_length - x.size(-1)), value=self.pad_value
+            )
+        elif pad_to > 0:
+            pad_amt = x.size(-1) % pad_to
+            if pad_amt != 0:
+                x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
+
+        return x, seq_len
+
+
+class CohereASRFeatureExtractor(SequenceFeatureExtractor):
+    """HF-compatible feature extractor wrapping FilterbankFeatures."""
+
+    model_input_names = ["input_features"]
+
+    def __init__(
+        self,
+        feature_size=64,
+        sampling_rate=16000,
+        padding_value=0.0,
+        max_duration=30,
+        n_window_size=320,
+        n_window_stride=160,
+        window="hann",
+        normalize="per_feature",
+        n_fft=None,
+        preemph=0.97,
+        lowfreq=0,
+        highfreq=None,
+        log=True,
+        log_zero_guard_type="add",
+        log_zero_guard_value=2**-24,
+        dither=CONSTANT,
+        pad_to=16,
+        frame_splicing=1,
+        exact_pad=False,
+        mag_power=2.0,
+        nb_augmentation_prob=0.0,
+        nb_max_freq=4000,
+        mel_norm="slaney",
+        stft_exact_pad=False,
+        stft_conv=False,
+        device="cpu",
+        **kwargs,
+    ):
+        """Record the filterbank settings; the filterbank itself is built lazily.
+
+        ``feature_size``, ``sampling_rate`` and ``padding_value`` (plus any
+        extra ``kwargs``) go to the base class. They are also forwarded to
+        ``FilterbankFeatures`` as ``nfilt``, ``sample_rate`` and ``pad_value``.
+        All remaining arguments are stored unchanged in ``self._fb_config``.
+        """
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            **kwargs,
+        )
+        self.max_duration = max_duration
+        self.hop_length = n_window_stride
+        self._device = torch.device(device)
+        # Keyword arguments for FilterbankFeatures, kept until first use.
+        self._fb_config = {
+            "sample_rate": sampling_rate,
+            "n_window_size": n_window_size,
+            "n_window_stride": n_window_stride,
+            "window": window,
+            "normalize": normalize,
+            "n_fft": n_fft,
+            "preemph": preemph,
+            "nfilt": feature_size,
+            "lowfreq": lowfreq,
+            "highfreq": highfreq,
+            "log": log,
+            "log_zero_guard_type": log_zero_guard_type,
+            "log_zero_guard_value": log_zero_guard_value,
+            "dither": dither,
+            "pad_to": pad_to,
+            "max_duration": max_duration,
+            "frame_splicing": frame_splicing,
+            "exact_pad": exact_pad,
+            "pad_value": padding_value,
+            "mag_power": mag_power,
+            "nb_augmentation_prob": nb_augmentation_prob,
+            "nb_max_freq": nb_max_freq,
+            "mel_norm": mel_norm,
+            "stft_exact_pad": stft_exact_pad,
+            "stft_conv": stft_conv,
+            "device": device,
+        }
+        self._filterbank: FilterbankFeatures | None = None
+
+    @property
+    def filterbank(self) -> FilterbankFeatures:
+        """Return the cached FilterbankFeatures module, creating it on first use.
+
+        The module is built from ``self._fb_config``, switched to eval mode
+        and moved to ``self._device``.
+        """
+        if self._filterbank is not None:
+            return self._filterbank
+        module = FilterbankFeatures(**self._fb_config)
+        module.eval()
+        self._filterbank = module.to(self._device)
+        return self._filterbank
+
+    def get_seq_len(self, seq_len):
+        """Delegate to ``FilterbankFeatures.get_seq_len`` on the lazily built
+        filterbank (this access constructs the filterbank if needed)."""
+        return self.filterbank.get_seq_len(seq_len)
+
+    def __call__(
+        self,
+        raw_speech,
+        sampling_rate=None,
+        return_tensors=None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Zero-pad the waveforms into one batch and run the filterbank.
+
+        A single ``np.ndarray`` is treated as a batch of one. The result holds
+        "input_features" and "length", both moved to CPU.
+
+        NOTE(review): ``sampling_rate`` and ``**kwargs`` are accepted but not
+        used here - confirm callers resample beforehand.
+        """
+        if isinstance(raw_speech, np.ndarray):
+            raw_speech = [raw_speech]
+
+        seq_len = torch.tensor([clip.shape[0] for clip in raw_speech])
+
+        # Right-pad every clip with zeros up to the longest clip in the batch.
+        longest = max(clip.shape[0] for clip in raw_speech)
+        batch = np.zeros((len(raw_speech), longest), dtype=np.float32)
+        for row, clip in enumerate(raw_speech):
+            batch[row, : clip.shape[0]] = clip
+
+        audio_tensor = torch.from_numpy(batch).to(self._device)
+        seq_len = seq_len.to(self._device)
+
+        with torch.no_grad():
+            input_features, length = self.filterbank(audio_tensor, seq_len)
+
+        features = {"input_features": input_features.cpu(), "length": length.cpu()}
+        result = BatchFeature(features)
+        if return_tensors is None:
+            return result
+        return result.convert_to_tensors(return_tensors)
+
+
+class CohereASRProcessor(ProcessorMixin):
+    """HF-compatible processor that pairs a CohereASRFeatureExtractor with a
+    tokenizer."""
+
+    feature_extractor_class = "CohereASRFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+
+    def __call__(
+        self,
+        text=None,
+        audio=None,
+        sampling_rate=None,
+        return_tensors=None,
+        **kwargs,
+    ):
+        """Extract audio features and/or tokenize text.
+
+        Audio, when given, goes through the feature extractor; otherwise an
+        empty ``BatchFeature`` is the starting point. When text is given, only
+        the tokenizer's "input_ids" entry is copied into the result. Extra
+        ``kwargs`` are forwarded to the tokenizer only.
+        """
+        if audio is None:
+            result = BatchFeature()
+        else:
+            result = self.feature_extractor(
+                audio,
+                sampling_rate=sampling_rate,
+                return_tensors=return_tensors,
+            )
+
+        if text is None:
+            return result
+
+        tokenized = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
+        result["input_ids"] = tokenized["input_ids"]
+        return result
+
+
+# Import-time side effect: register both classes with the transformers Auto*
+# factories, keyed by their class-name strings.
+AutoFeatureExtractor.register("CohereASRFeatureExtractor", CohereASRFeatureExtractor)
+AutoProcessor.register("CohereASRProcessor", CohereASRProcessor)
--- a/vllm/transformers_utils/processors/fireredasr2.py
+++ b/vllm/transformers_utils/processors/fireredasr2.py
@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
        for speech in raw_speech:
            """
            We must multiply by 32768 here because FireRedASR2 loads audio data
-            using kaldiio.load_mat, while vLLM loads audio data using librosa.
+            using kaldiio.load_mat, while vLLM loads audio data using pyav.
            """
            speech = speech * 32768
            fbank = self.fbank(sampling_rate, speech)

--- a/vllm/transformers_utils/processors/glm4v.py
+++ b/vllm/transformers_utils/processors/glm4v.py
@@ -29,13 +29,8 @@ class GLM4VProcessor(ProcessorMixin):

    def __init__(
        self,
+        image_processor: GLM4VImageProcessorFast,
        tokenizer: PreTrainedTokenizer,
-        image_size: int,
-        image_processor: GLM4VImageProcessorFast | None = None,
    ) -> None:
-        self.tokenizer = tokenizer
-        if image_processor is None:
-            image_processor = GLM4VImageProcessorFast(
-                size={"width": image_size, "height": image_size}
-            )
        self.image_processor = image_processor
+        self.tokenizer = tokenizer
--- a/vllm/transformers_utils/processors/h2ovl.py
+++ b/vllm/transformers_utils/processors/h2ovl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
+# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
+# --------------------------------------------------------
+# H2OVL-Mississippi
+# Copyright (c) 2024 H2O.AI
+# Licensed under Apache 2.0 License [see LICENSE for details]
+# --------------------------------------------------------
+import torch
+from PIL import Image
+
+from vllm.tokenizers.hf import HfTokenizer
+
+from .internvl import (
+    InternVLImageProcessor,
+    InternVLProcessor,
+    build_transform,
+    find_closest_aspect_ratio,
+    get_internvl_target_ratios,
+)
+
+
+def resolve_h2ovl_min_max_num(
+    *,
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: bool,
+    use_thumbnail: bool,
+) -> tuple[int, int]:
+    """Return the effective ``(min, max)`` patch-count bounds.
+
+    With ``dynamic_image_size`` disabled both bounds collapse to 1. Otherwise
+    the configured bounds are used, and the maximum is raised by one when
+    ``use_thumbnail`` is set and the maximum is not 1.
+    """
+    if not dynamic_image_size:
+        # Both bounds are 1, so the thumbnail adjustment can never apply.
+        return 1, 1
+
+    if use_thumbnail and max_dynamic_patch != 1:
+        return min_dynamic_patch, max_dynamic_patch + 1
+
+    return min_dynamic_patch, max_dynamic_patch
+
+
+def get_h2ovl_target_ratios(
+    min_num: int,
+    max_num: int,
+    *,
+    prior_aspect_ratio: tuple[int, int] | None,
+) -> list[tuple[int, int]]:
+    """Return candidate tiling ratios, optionally filtered by a prior ratio.
+
+    The candidates come from ``get_internvl_target_ratios(min_num, max_num)``.
+    When ``prior_aspect_ratio`` is given, a candidate is kept only if neither
+    of its components evenly divides the matching component of the prior
+    ratio.
+    """
+    candidates = get_internvl_target_ratios(min_num, max_num)
+    if prior_aspect_ratio is None:
+        return candidates
+
+    kept = []
+    for ratio in candidates:
+        # Drop the candidate as soon as one component divides the prior's.
+        if prior_aspect_ratio[0] % ratio[0] == 0:
+            continue
+        if prior_aspect_ratio[1] % ratio[1] == 0:
+            continue
+        kept.append(ratio)
+    return kept
+
+
+# modified to include blocks generated in second pass
+def calculate_h2ovl_targets(
+    *,
+    orig_width: int,
+    orig_height: int,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[int, int, int, tuple[int, int]]:
+    """Pick the best tiling ratio and derive the block count and target size.
+
+    Returns:
+        ``(blocks, target_width, target_height, target_aspect_ratio)`` where
+        ``target_aspect_ratio`` is the entry chosen by
+        ``find_closest_aspect_ratio``, the target dimensions are that ratio
+        scaled by ``image_size``, and ``blocks`` is the product of the ratio's
+        components, plus one when ``use_thumbnail`` is set and that product is
+        not 1.
+    """
+    chosen_ratio = find_closest_aspect_ratio(
+        orig_width / orig_height,
+        target_ratios,
+        width=orig_width,
+        height=orig_height,
+        image_size=image_size,
+    )
+    ratio_w = chosen_ratio[0]
+    ratio_h = chosen_ratio[1]
+
+    num_blocks = ratio_w * ratio_h
+    # A thumbnail is only counted when the image is split into several tiles.
+    if use_thumbnail and num_blocks != 1:
+        num_blocks += 1
+
+    return num_blocks, image_size * ratio_w, image_size * ratio_h, chosen_ratio
+
+
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+# refactored to handle prior_aspect_ratio
+def dynamic_preprocess_h2ovl(
+    image: Image.Image,
+    *,
+    target_ratios: list[tuple[int, int]],
+    image_size: int,
+    use_thumbnail: bool,
+) -> tuple[list[Image.Image], tuple[int, int]]:
+    """Resize ``image`` to the chosen tiling and cut it into square tiles.
+
+    Returns:
+        The ``image_size`` x ``image_size`` tiles in row-major order, followed
+        by a thumbnail of the whole image when ``use_thumbnail`` is set and
+        more than one tile was produced, together with the chosen ratio.
+    """
+    orig_width, orig_height = image.size
+
+    # The thumbnail is appended below, so it is excluded from the count here.
+    num_tiles, target_width, target_height, target_aspect_ratio = (
+        calculate_h2ovl_targets(
+            orig_width=orig_width,
+            orig_height=orig_height,
+            target_ratios=target_ratios,
+            image_size=image_size,
+            use_thumbnail=False,
+        )
+    )
+
+    resized = image.resize((target_width, target_height))
+    tiles_per_row = target_width // image_size
+
+    tiles = []
+    for index in range(num_tiles):
+        row, col = divmod(index, tiles_per_row)
+        left = col * image_size
+        top = row * image_size
+        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
+
+    assert len(tiles) == num_tiles
+
+    if use_thumbnail and len(tiles) != 1:
+        tiles.append(image.resize((image_size, image_size)))
+
+    return tiles, target_aspect_ratio
+
+
+def _preprocess_image(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    prior_aspect_ratio: tuple[int, int] | None,
+) -> tuple[torch.Tensor, tuple[int, int]]:
+    """Tile one image and stack the transformed tiles.
+
+    Returns:
+        The stacked result of applying ``build_transform(input_size=...)`` to
+        each tile from ``dynamic_preprocess_h2ovl``, and the tiling ratio that
+        was selected.
+    """
+    candidate_ratios = get_h2ovl_target_ratios(
+        min_num,
+        max_num,
+        prior_aspect_ratio=prior_aspect_ratio,
+    )
+    to_tensor = build_transform(input_size=input_size)
+
+    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
+        image,
+        target_ratios=candidate_ratios,
+        image_size=input_size,
+        use_thumbnail=use_thumbnail,
+    )
+
+    tile_tensors = [to_tensor(tile) for tile in tiles]
+    return torch.stack(tile_tensors), chosen_ratio
+
+
+# refactored to use the _preprocess_image function
+def image_to_pixel_values_h2ovl(
+    image: Image.Image,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+    use_msac: bool,
+) -> torch.Tensor:
+    """Convert one image into its stacked tile tensor.
+
+    Without MSAC the image is processed once with the given ``min_num`` and
+    ``use_thumbnail``. With MSAC it is processed twice, and both passes ignore
+    those two arguments: they always enable the thumbnail and use a minimum of
+    1 and 3 respectively. The second pass is restricted by the ratio chosen
+    in the first.
+    """
+    if not use_msac:
+        single_pass, _ = _preprocess_image(
+            image,
+            input_size=input_size,
+            min_num=min_num,
+            max_num=max_num,
+            use_thumbnail=use_thumbnail,
+            prior_aspect_ratio=None,
+        )
+        return single_pass
+
+    first_pass, first_ratio = _preprocess_image(
+        image,
+        input_size=input_size,
+        min_num=1,
+        max_num=max_num,
+        use_thumbnail=True,
+        prior_aspect_ratio=None,
+    )
+    second_pass, _ = _preprocess_image(
+        image,
+        input_size=input_size,
+        min_num=3,
+        max_num=max_num,
+        use_thumbnail=True,
+        prior_aspect_ratio=first_ratio,
+    )
+
+    # Drop the last entry of each pass, then append the second pass's last
+    # entry once at the end.
+    return torch.cat([second_pass[:-1], first_pass[:-1], second_pass[-1:]], 0)
+
+
+class H2OVLImageProcessor(InternVLImageProcessor):
+    """InternVL image processor variant with optional two-pass (MSAC) tiling."""
+
+    def __init__(
+        self,
+        image_size: int,
+        min_dynamic_patch: int,
+        max_dynamic_patch: int,
+        dynamic_image_size: bool,
+        use_thumbnail: bool,
+        use_msac: bool,
+    ) -> None:
+        """Initialise the InternVL base and remember the ``use_msac`` flag."""
+        super().__init__(
+            image_size=image_size,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+
+        self.use_msac = use_msac
+
+    def resolve_min_max_num(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+    ) -> tuple[int, int]:
+        """Resolve the ``(min, max)`` patch bounds.
+
+        Any argument left as ``None`` falls back to the attribute of the same
+        name on this processor before ``resolve_h2ovl_min_max_num`` is called.
+        """
+        return resolve_h2ovl_min_max_num(
+            min_dynamic_patch=(
+                self.min_dynamic_patch
+                if min_dynamic_patch is None
+                else min_dynamic_patch
+            ),
+            max_dynamic_patch=(
+                self.max_dynamic_patch
+                if max_dynamic_patch is None
+                else max_dynamic_patch
+            ),
+            dynamic_image_size=(
+                self.dynamic_image_size
+                if dynamic_image_size is None
+                else dynamic_image_size
+            ),
+            use_thumbnail=(
+                self.use_thumbnail if use_thumbnail is None else use_thumbnail
+            ),
+        )
+
+    def _images_to_pixel_values_lst(
+        self,
+        images: list[Image.Image],
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+    ) -> list[torch.Tensor]:
+        """Convert each image into its stacked tile tensor.
+
+        MSAC is only honoured when exactly one image is passed; for any other
+        number of images it is switched off.
+        """
+        if len(images) == 1:
+            msac_enabled = self.use_msac
+        else:
+            msac_enabled = False
+
+        # The thumbnail is applied inside image_to_pixel_values_h2ovl, so it
+        # must not inflate the maximum here.
+        lower, upper = self.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,
+        )
+
+        return [
+            image_to_pixel_values_h2ovl(
+                img,
+                input_size=self.image_size,
+                min_num=lower,
+                max_num=upper,
+                use_thumbnail=self.use_thumbnail,
+                use_msac=msac_enabled,
+            )
+            for img in images
+        ]
+
+
+class H2OVLProcessor(InternVLProcessor):
+    def __init__(
+        self,
+        image_processor: H2OVLImageProcessor,
+        tokenizer: HfTokenizer,
+        *,
+        image_seq_length: int,
+        start_image_token: str = "<img>",
+        end_image_token: str = "</img>",
+        ctx_image_token: str = "<IMG_CONTEXT>",
+    ) -> None:
+        """Forward all arguments unchanged to ``InternVLProcessor.__init__``."""
+        super().__init__(
+            image_processor=image_processor,
+            tokenizer=tokenizer,
+            image_seq_length=image_seq_length,
+            start_image_token=start_image_token,
+            end_image_token=end_image_token,
+            ctx_image_token=ctx_image_token,
+        )
+
+        # Annotation only (no assignment): declares the attribute's type as the
+        # H2OVL-specific image processor.
+        self.image_processor: H2OVLImageProcessor
+
+    def resolve_target_ratios(
+        self,
+        *,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+        prior_aspect_ratio: tuple[int, int] | None = None,
+        override_min_num: int | None = None,
+    ) -> list[tuple[int, int]]:
+        """Return the candidate tiling ratios for the given settings.
+
+        The patch bounds come from the image processor's
+        ``resolve_min_max_num``. ``override_min_num``, when given, replaces the
+        resolved minimum before ``get_h2ovl_target_ratios`` is called.
+        """
+        resolved_min, resolved_max = self.image_processor.resolve_min_max_num(
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=use_thumbnail,
+        )
+        effective_min = (
+            resolved_min if override_min_num is None else override_min_num
+        )
+
+        return get_h2ovl_target_ratios(
+            effective_min,
+            resolved_max,
+            prior_aspect_ratio=prior_aspect_ratio,
+        )
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        use_msac: bool | None = None,
+    ) -> int:
+        """Return the number of image tokens for an image of the given size.
+
+        The result is the patch count multiplied by ``self.image_seq_length``.
+        ``use_msac`` defaults to the image processor's setting. With MSAC the
+        patch count is the sum of both passes minus one.
+        """
+        image_processor = self.image_processor
+        if use_msac is None:
+            use_msac = image_processor.use_msac
+
+        use_thumbnail = image_processor.use_thumbnail
+
+        def count_patches(
+            ratios: list[tuple[int, int]], with_thumbnail: bool
+        ) -> tuple[int, tuple[int, int]]:
+            # Patch count and chosen ratio for this image under ``ratios``.
+            count, _, _, chosen = calculate_h2ovl_targets(
+                orig_width=image_width,
+                orig_height=image_height,
+                image_size=image_processor.image_size,
+                target_ratios=ratios,
+                use_thumbnail=with_thumbnail,
+            )
+            return count, chosen
+
+        # In every resolve_target_ratios call below the thumbnail is disabled
+        # because calculate_h2ovl_targets accounts for it instead.
+        if not use_msac:
+            ratios = self.resolve_target_ratios(use_thumbnail=False)
+            num_patches, _ = count_patches(ratios, use_thumbnail)
+            return num_patches * self.image_seq_length
+
+        first_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,
+            override_min_num=1,
+        )
+        first_count, first_aspect = count_patches(first_ratios, True)
+
+        second_ratios = self.resolve_target_ratios(
+            use_thumbnail=False,
+            prior_aspect_ratio=first_aspect,
+            override_min_num=3,
+        )
+        second_count, _ = count_patches(second_ratios, True)
+
+        num_patches = first_count + second_count - 1
+        return num_patches * self.image_seq_length