Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -16,7 +16,7 @@
# limitations under the License.
"""Qwen3.5-MoE model configuration"""
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.configuration_utils import PretrainedConfig
class Qwen3_5MoeTextConfig(PretrainedConfig):
......@@ -75,10 +75,6 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
eos_token_id=None,
**kwargs,
):
kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
......@@ -104,7 +100,18 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types, self.num_hidden_layers)
if hasattr(self, "validate_layer_type"):
# Transformers v5
kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}
self.validate_layer_type()
else:
# Transformers v4
from transformers.configuration_utils import layer_type_validation
layer_type_validation(self.layer_types, self.num_hidden_layers)
# linear attention part
self.linear_conv_kernel_dim = linear_conv_kernel_dim
......
......@@ -408,7 +408,6 @@ class Qwen3ASRConfig(PretrainedConfig):
support_languages=None,
**kwargs,
):
super().__init__(**kwargs)
if thinker_config is None:
thinker_config = {}
logger.info(
......@@ -417,6 +416,7 @@ class Qwen3ASRConfig(PretrainedConfig):
self.thinker_config = Qwen3ASRThinkerConfig(**thinker_config)
self.support_languages = support_languages
super().__init__(**kwargs)
def get_text_config(self, decoder=False) -> "PretrainedConfig":
"""
......
......@@ -16,7 +16,7 @@
# limitations under the License.
"""Qwen3-Next model configuration"""
from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
......@@ -253,7 +253,14 @@ class Qwen3NextConfig(PretrainedConfig):
"linear_attention" if bool((i + 1) % 4) else "full_attention"
for i in range(self.num_hidden_layers)
]
layer_type_validation(self.layer_types)
if hasattr(self, "validate_layer_type"):
# Transformers v5
self.validate_layer_type()
else:
# Transformers v4
from transformers.configuration_utils import layer_type_validation
layer_type_validation(self.layer_types)
# linear attention part
self.linear_conv_kernel_dim = linear_conv_kernel_dim
......
......@@ -47,6 +47,14 @@ class RadioConfig(PretrainedConfig):
teachers: A list of teacher model configurations. Each teacher configuration is
a dict with keys like "name" and some may have "use_summary".
cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
video_temporal_patch_size: Number of consecutive video frames grouped into
a single tubelet for temporal compression. Default 1 (no compression).
When > 1, a dedicated video_embedder (3*T*P*P -> hidden) is created
alongside the image embedder (3*P*P -> hidden).
separate_video_embedder: When True and video_temporal_patch_size > 1, use a
dedicated video patch embedder (3*T*P*P -> hidden) separate from the
image embedder (3*P*P -> hidden). When False, a single embedder with
input size 3*T*P*P is used for both (images are duplicated T times).
"""
model_type = "radio"
......@@ -68,6 +76,8 @@ class RadioConfig(PretrainedConfig):
register_multiple: int | None = None,
teachers: list[dict[str, Any]] | None = None,
cls_token_per_teacher: bool = False,
video_temporal_patch_size: int = 1,
separate_video_embedder: bool = True,
**kwargs,
):
self.model_name = model_name
......@@ -95,4 +105,6 @@ class RadioConfig(PretrainedConfig):
self.register_multiple = register_multiple
self.teachers = teachers if teachers is not None else []
self.cls_token_per_teacher = cls_token_per_teacher
self.video_temporal_patch_size = video_temporal_patch_size
self.separate_video_embedder = separate_video_embedder
super().__init__(**kwargs)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .base import SpeculatorsConfig
__all__ = ["SpeculatorsConfig"]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from dataclasses import fields, is_dataclass
from typing import Any
from transformers import PretrainedConfig
......@@ -8,15 +9,29 @@ from transformers import PretrainedConfig
from vllm.transformers_utils.configs.speculators.algos import (
SUPPORTED_SPECULATORS_TYPES,
)
__all__ = ["SpeculatorsConfig"]
from vllm.transformers_utils.utils import without_trust_remote_code
class SpeculatorsConfig(PretrainedConfig):
model_type = "speculators"
def __init__(self, **kwargs):
    """Initialise the config from arbitrary keyword arguments.

    Two code paths exist, selected by whether ``PretrainedConfig`` is a
    dataclass:

    * Not a dataclass (Transformers v4): everything is forwarded to
      ``super().__init__``, which sets all kwargs as attributes.
    * Dataclass (Transformers v5): ``super().__init__`` validates before
      setting attributes, so kwargs that are not ``PretrainedConfig``
      fields are set on ``self`` first and only the recognised fields are
      forwarded.
    """
    if is_dataclass(PretrainedConfig):
        # Transformers v5
        base_field_names = {f.name for f in fields(PretrainedConfig)}
        forwarded = {}
        for name, val in kwargs.items():
            if name == "model_type":
                # model_type is a class variable; never set it per instance.
                continue
            if name in base_field_names:
                forwarded[name] = val
            else:
                setattr(self, name, val)
        super().__init__(**forwarded)
    else:
        # Transformers v4
        super().__init__(**kwargs)
@classmethod
def from_pretrained(
cls,
......
......@@ -43,7 +43,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
use `False`, but v0.5 and above use `True`.
"""
wrapped_model_config: transformers.PretrainedConfig
model_type = "ultravox"
audio_token = "<|audio|>"
is_composition = False
......@@ -75,6 +74,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
self.num_projector_layers = num_projector_layers
# N.B. May set the wrapped_model_config below.
self.wrapped_model_config: transformers.PretrainedConfig
self.text_model_id = text_model_id
if text_model_id is None:
text_config = text_config or {}
......
......@@ -228,7 +228,7 @@ class ModelArchConfigConvertorBase:
"pangu_ultra_moe_mtp",
"bailing_hybrid",
):
return self.hf_text_config.kv_lora_rank is not None
return getattr(self.hf_text_config, "kv_lora_rank", None) is not None
elif self.hf_text_config.model_type == "eagle":
# if the model is an EAGLE module, check for the
# underlying architecture
......@@ -241,7 +241,7 @@ class ModelArchConfigConvertorBase:
"deepseek_v32",
"deepseek_mtp",
)
and self.hf_text_config.kv_lora_rank is not None
and getattr(self.hf_text_config, "kv_lora_rank", None) is not None
)
return False
......@@ -300,6 +300,28 @@ class ModelArchConfigConvertorBase:
return model_arch_config
class CohereAsrModelArchConfigConvertor(ModelArchConfigConvertorBase):
    """Convertor reading attention geometry from a Cohere ASR HF config.

    Decoder values come from ``hf_text_config.transf_decoder["config_dict"]``
    and the encoder head count from ``hf_text_config.encoder["n_heads"]``.
    """

    def get_total_num_attention_heads(self) -> int:
        """Return the decoder's ``num_attention_heads``."""
        decoder_cfg = self.hf_text_config.transf_decoder["config_dict"]
        return decoder_cfg["num_attention_heads"]

    def get_head_size(self) -> int:
        """Return decoder ``hidden_size`` floor-divided by its head count."""
        width = self.hf_text_config.transf_decoder["config_dict"]["hidden_size"]
        heads = self.hf_text_config.transf_decoder["config_dict"][
            "num_attention_heads"
        ]
        return width // heads

    def get_total_num_kv_heads(self) -> int:
        """Return the KV head count, asserting encoder and decoder agree."""
        encoder_heads = self.hf_text_config.encoder["n_heads"]
        decoder_heads = self.hf_text_config.transf_decoder["config_dict"][
            "num_attention_heads"
        ]
        assert encoder_heads == decoder_heads, (
            "Encoder and decoder must have the same number of kv heads"
        )
        return encoder_heads
class MambaModelArchConfigConvertor(ModelArchConfigConvertorBase):
def get_head_size(self) -> int:
return 0
......@@ -425,6 +447,7 @@ class LongCatFlashMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
# hf_config.model_type -> convertor class
MODEL_ARCH_CONFIG_CONVERTORS = {
"cohere_asr": CohereAsrModelArchConfigConvertor,
"mamba": MambaModelArchConfigConvertor,
"falcon_mamba": MambaModelArchConfigConvertor,
"timm_wrapper": TerratorchModelArchConfigConvertor,
......
......@@ -12,36 +12,56 @@ import importlib
# Public processor class names exported by this package. Every name listed
# here also has an entry in `_CLASS_TO_MODULE` below.
__all__ = [
    "BagelProcessor",
    "CohereASRProcessor",
    "DeepseekVLV2Processor",
    "FireRedASR2Processor",
    "FunASRProcessor",
    "GLM4VProcessor",
    "H2OVLProcessor",
    "HunYuanVLProcessor",
    "HunYuanVLImageProcessor",
    "InternVLProcessor",
    "IsaacProcessor",
    "KimiAudioProcessor",
    "KimiK25Processor",
    "MistralCommonPixtralProcessor",
    "MistralCommonVoxtralProcessor",
    "NanoNemotronVLProcessor",
    "NemotronVLProcessor",
    "LlamaNemotronVLEmbedProcessor",
    "NVLMProcessor",
    "OvisProcessor",
    "Ovis2_5Processor",
    "QwenVLProcessor",
    "Qwen3ASRProcessor",
    "Step3VLProcessor",
]
# Maps each exported class name to the dotted path of the module defining it.
# NOTE(review): presumably consumed by a lazy-import hook (the file imports
# `importlib`), but that code is outside this chunk — confirm.
_CLASS_TO_MODULE: dict[str, str] = {
    "BagelProcessor": "vllm.transformers_utils.processors.bagel",
    "CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
    "DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
    "FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
    "FunASRProcessor": "vllm.transformers_utils.processors.funasr",
    "GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
    "H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
    "HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
    "HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
    "InternVLProcessor": "vllm.transformers_utils.processors.internvl",
    "IsaacProcessor": "vllm.transformers_utils.processors.isaac",
    "KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
    "KimiK25Processor": "vllm.transformers_utils.processors.kimi_k25",
    "MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
    "MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
    "NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
    "NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
    # Defined in the same module as NemotronVLProcessor.
    "LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
    "NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
    "OvisProcessor": "vllm.transformers_utils.processors.ovis",
    "Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
    "QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
    "Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
    "Step3VLProcessor": "vllm.transformers_utils.processors.step3_vl",
}
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import math
import random
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from transformers import AutoFeatureExtractor, AutoProcessor, BatchFeature
from transformers.feature_extraction_sequence_utils import (
SequenceFeatureExtractor,
)
from transformers.processing_utils import ProcessorMixin
# Module-level logger.
logger = logging.getLogger(__name__)
# Small positive constant: default dither amplitude, the offset added to the
# standard deviation during normalization, and the sqrt guard when gradients
# are enabled.
CONSTANT = 1e-5
# NOTE(review): INF_VAL is not referenced anywhere in the visible part of this
# file — confirm whether it is still needed.
INF_VAL = 10000.0
class FilterbankFeatures(nn.Module):
    """Featurizer that converts wavs to Mel Spectrograms.
    See AudioToMelSpectrogramPreprocessor for args.

    The module keeps two buffers: ``window`` (the STFT window) and ``fb``
    (the mel filterbank matrix built with ``librosa.filters.mel``). Both are
    stored in bfloat16 and cast back to the working dtype when used.
    """

    window: torch.Tensor
    fb: torch.Tensor

    def __init__(
        self,
        sample_rate=16000,
        n_window_size=320,
        n_window_stride=160,
        window="hann",
        normalize="per_feature",
        n_fft=None,
        preemph=0.97,
        nfilt=64,
        lowfreq=0,
        highfreq=None,
        log=True,
        log_zero_guard_type="add",
        log_zero_guard_value=2**-24,
        dither=CONSTANT,
        pad_to=16,
        max_duration=30,
        frame_splicing=1,
        exact_pad=False,
        pad_value=0,
        mag_power=2.0,
        use_grads=False,
        rng=None,
        nb_augmentation_prob=0.0,
        nb_max_freq=4000,
        mel_norm="slaney",
        stft_exact_pad=False,
        stft_conv=False,
        device="cpu",
    ):
        """Validate the arguments and build the window / filterbank buffers.

        Raises:
            NotImplementedError: ``exact_pad`` is set with an odd
                ``n_window_stride``.
            ValueError: ``n_window_size`` / ``n_window_stride`` are not
                positive ints, or ``log_zero_guard_type`` is neither
                ``"add"`` nor ``"clamp"``.
            AssertionError: the requested ``window`` does not resolve to a
                window tensor (including ``"none"``).
        """
        super().__init__()
        if stft_conv or stft_exact_pad:
            logger.warning(
                "Using torch_stft is deprecated and has been removed. "
                "The values have been forcibly set to False for "
                "FilterbankFeatures and AudioToMelSpectrogramPreprocessor. "
                "Please set exact_pad to True as needed."
            )
        if exact_pad and n_window_stride % 2 == 1:
            raise NotImplementedError(
                f"{self} received exact_pad == True, but hop_size was odd. "
                "If audio_length % hop_size == 0, the returned spectrogram "
                "would not be of length audio_length // hop_size. "
                "Please use an even hop_size."
            )
        self.log_zero_guard_value = log_zero_guard_value
        if (
            n_window_size is None
            or n_window_stride is None
            or not isinstance(n_window_size, int)
            or not isinstance(n_window_stride, int)
            or n_window_size <= 0
            or n_window_stride <= 0
        ):
            raise ValueError(
                f"{self} got an invalid value for either n_window_size or "
                f"n_window_stride. Both must be positive ints."
            )

        self.sample_rate = sample_rate
        self.win_length = n_window_size
        self.hop_length = n_window_stride
        # Default FFT size: the smallest power of two >= the window length.
        self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
        self.stft_pad_amount = (
            (self.n_fft - self.hop_length) // 2 if exact_pad else None
        )
        self.exact_pad = exact_pad
        self.max_duration = max_duration

        if exact_pad:
            logger.info("STFT using exact pad")

        torch_windows = {
            "hann": torch.hann_window,
            "hamming": torch.hamming_window,
            "blackman": torch.blackman_window,
            "bartlett": torch.bartlett_window,
            "none": None,
        }
        window_fn = torch_windows.get(window)
        window_tensor = (
            window_fn(self.win_length, periodic=False) if window_fn else None
        )
        self.register_buffer("window", window_tensor)

        self.normalize = normalize
        self.log = log
        self.dither = dither
        self.frame_splicing = frame_splicing
        self.nfilt = nfilt
        self.preemph = preemph
        self.pad_to = pad_to
        highfreq = highfreq or sample_rate / 2

        # Minimum-duration padding is disabled (a threshold of 0.0 seconds
        # means the padding branch in forward() is never taken).
        self.pad_min_duration = 0.0
        self.pad_direction = "both"

        filterbanks = torch.tensor(
            librosa.filters.mel(
                sr=sample_rate,
                n_fft=self.n_fft,
                n_mels=nfilt,
                fmin=lowfreq,
                fmax=highfreq,
                norm=mel_norm,
            ),
            dtype=torch.float,
        ).unsqueeze(0)
        self.register_buffer("fb", filterbanks)

        # Calculate maximum sequence length
        max_length = self.get_seq_len(
            torch.tensor(max_duration * sample_rate, dtype=torch.float)
        )
        # NOTE(review): forward() accepts pad_to == "max", but the comparison
        # below raises TypeError for a str, so that mode cannot currently be
        # constructed — confirm intended behavior before relying on it.
        max_pad = pad_to - (max_length % pad_to) if pad_to > 0 else 0
        self.max_length = max_length + max_pad
        self.pad_value = pad_value
        self.mag_power = mag_power

        # We want to avoid taking the log of zero
        # There are two options: either adding or clamping to a small value
        if log_zero_guard_type not in ["add", "clamp"]:
            raise ValueError(
                f"{self} received {log_zero_guard_type} for the "
                f"log_zero_guard_type parameter. It must be either 'add' or "
                f"'clamp'."
            )

        self.use_grads = use_grads
        if not use_grads:
            # Wrap forward so it always runs without gradient tracking.
            self.forward = torch.no_grad()(self.forward)
        self._rng = random.Random() if rng is None else rng
        self.nb_augmentation_prob = nb_augmentation_prob
        if self.nb_augmentation_prob > 0.0:
            if nb_max_freq >= sample_rate / 2:
                self.nb_augmentation_prob = 0.0
            else:
                # Use the resolved FFT size: the raw `n_fft` argument defaults
                # to None and would raise TypeError in this multiplication.
                self._nb_max_fft_bin = int(
                    (nb_max_freq / sample_rate) * self.n_fft
                )

        # log_zero_guard_value is the small value we want to use; we support
        # an actual number, or "tiny", or "eps"
        self.log_zero_guard_type = log_zero_guard_type

        assert self.window is not None
        assert self.fb is not None
        self.window = self.window.to(dtype=torch.bfloat16)
        self.fb = self.fb.to(dtype=torch.bfloat16)

        # Fixed-seed generator so the dither noise is reproducible.
        self.generator = torch.Generator(device=device)
        self.generator.manual_seed(0)

    @torch._dynamo.disable
    def stft(self, x):
        """Return the complex STFT of ``x`` computed in float32.

        ``center`` is enabled unless ``exact_pad`` was requested, in which
        case forward() applies the padding manually beforehand.
        """
        # disable autocast to get full range of stft values
        with torch.amp.autocast(x.device.type, enabled=False):
            return torch.stft(
                x,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                win_length=self.win_length,
                center=not self.exact_pad,
                window=self.window.to(dtype=torch.float, device=x.device),
                return_complex=True,
                pad_mode="constant",
            )

    def log_zero_guard_value_fn(self, x):
        """Resolve the log guard value for the dtype of ``x``.

        Returns the configured number as-is, or ``torch.finfo(x.dtype).tiny``
        / ``.eps`` when the configured value is ``"tiny"`` / ``"eps"``.

        Raises:
            ValueError: the configured value is any other string.
        """
        if isinstance(self.log_zero_guard_value, str):
            if self.log_zero_guard_value == "tiny":
                return torch.finfo(x.dtype).tiny
            elif self.log_zero_guard_value == "eps":
                return torch.finfo(x.dtype).eps
            else:
                raise ValueError(
                    f"{self} received {self.log_zero_guard_value} for the "
                    f"log_zero_guard_value parameter. It must be either a "
                    f"number, 'tiny', or 'eps'"
                )
        else:
            return self.log_zero_guard_value

    def get_seq_len(self, seq_len):
        """Convert sample counts (a tensor) into STFT frame counts (long)."""
        # With exact_pad the total padding is 2 * stft_pad_amount; otherwise
        # the STFT runs with center=True, which pads n_fft // 2 on each side.
        pad_amount = (
            self.stft_pad_amount * 2
            if self.stft_pad_amount is not None
            else self.n_fft // 2 * 2
        )
        seq_len = torch.floor_divide(
            (seq_len + pad_amount - self.n_fft), self.hop_length
        )
        return seq_len.to(dtype=torch.long)

    @property
    def filter_banks(self):
        """The mel filterbank buffer (``self.fb``)."""
        return self.fb

    def splice_frames(self, x, frame_splicing):
        """Stacks frames together across feature dim

        input is batch_size, feature_dim, num_frames
        output is batch_size, feature_dim*frame_splicing, num_frames
        """
        seq = [x]
        for n in range(1, frame_splicing):
            # NOTE(review): concatenating x[:, :, :n] and x[:, :, n:] along
            # dim 2 reconstructs x unchanged, so every spliced entry equals x.
            # Left as-is to preserve existing outputs — confirm intent.
            seq.append(torch.cat([x[:, :, :n], x[:, :, n:]], dim=2))
        return torch.cat(seq, dim=1)

    def normalize_batch(self, x, seq_len, normalize_type):
        """Normalize ``x`` and return ``(normalized, mean, std)``.

        ``normalize_type`` selects the mode:

        * ``"per_feature"``: mean/std per batch item and feature, computed
          over the first ``seq_len`` frames only.
        * ``"all_features"``: one mean/std per batch item over all features
          and the first ``seq_len`` frames.
        * a mapping containing ``"fixed_mean"`` and ``"fixed_std"``: use the
          provided statistics.
        * anything else: ``x`` is returned unchanged with ``None`` statistics.
        """
        x_mean = None
        x_std = None
        if normalize_type == "per_feature":
            batch_size = x.shape[0]
            max_time = x.shape[2]

            # No check for seq_len == 1 is performed here: it would need
            # .item(), which calls cudaStreamSynchronize() and is not allowed
            # during CUDA graph stream capture. A single valid frame yields a
            # NaN std, which is replaced by 0.0 below.
            time_steps = (
                torch.arange(max_time, device=x.device)
                .unsqueeze(0)
                .expand(batch_size, max_time)
            )
            valid_mask = time_steps < seq_len.unsqueeze(1)
            x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2)
            x_mean_denominator = valid_mask.sum(axis=1)
            x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1)

            # Subtract 1 in the denominator to correct for the bias.
            x_std = torch.sqrt(
                torch.sum(
                    torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0)
                    ** 2,
                    axis=2,
                )
                / (x_mean_denominator.unsqueeze(1) - 1.0)
            )
            x_std = x_std.masked_fill(
                x_std.isnan(), 0.0
            )  # edge case: only 1 frame in denominator
            # make sure x_std is not zero
            x_std += CONSTANT
            return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std
        elif normalize_type == "all_features":
            x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
            x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
            for i in range(x.shape[0]):
                x_mean[i] = x[i, :, : seq_len[i].item()].mean()
                x_std[i] = x[i, :, : seq_len[i].item()].std()
            # make sure x_std is not zero
            x_std += CONSTANT
            return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1), x_mean, x_std
        elif "fixed_mean" in normalize_type and "fixed_std" in normalize_type:
            x_mean = torch.tensor(normalize_type["fixed_mean"], device=x.device)
            x_std = torch.tensor(normalize_type["fixed_std"], device=x.device)
            return (
                (x - x_mean.view(x.shape[0], x.shape[1]).unsqueeze(2))
                / x_std.view(x.shape[0], x.shape[1]).unsqueeze(2),
                x_mean,
                x_std,
            )
        else:
            return x, x_mean, x_std

    @torch.compile
    def forward(self, x, seq_len, linear_spec=False):
        """Compute (log-)mel features for a batch of waveforms.

        Args:
            x: waveform batch indexed as ``[batch, samples]``.
            seq_len: per-item sample counts.
            linear_spec: when true, return the spectrogram before the mel
                projection.

        Returns:
            ``(features, seq_len)`` where ``seq_len`` is the per-item frame
            count from :meth:`get_seq_len`.

        Note:
            When dither is enabled the noise is added to ``x`` in place.
        """
        if x.shape[1] < self.sample_rate * self.pad_min_duration:
            pad_amount = int(self.sample_rate * self.pad_min_duration) - x.shape[1]
            if self.pad_direction == "right":
                x = F.pad(x, (0, pad_amount), value=self.pad_value)
            elif self.pad_direction == "left":
                x = F.pad(x, (pad_amount, 0), value=self.pad_value)
            elif self.pad_direction == "both":
                left_pad = pad_amount // 2
                right_pad = pad_amount - left_pad
                x = F.pad(x, (left_pad, right_pad), value=self.pad_value)
            else:
                raise ValueError(
                    f"{self} received an invalid pad_direction: {self.pad_direction}. "
                    f"It must be one of 'left', 'right', or 'both'."
                )
            seq_len = torch.tensor([x.shape[1]], dtype=torch.float, device=x.device)

        seq_len_time = seq_len
        seq_len_unfixed = self.get_seq_len(seq_len)
        # fix for seq_len = 0 for streaming; if size was 0, it is always padded
        # to 1, and normalizer fails
        seq_len = torch.where(
            seq_len == 0, torch.zeros_like(seq_len_unfixed), seq_len_unfixed
        )

        if self.stft_pad_amount is not None:
            x = torch.nn.functional.pad(
                x.unsqueeze(1), (self.stft_pad_amount, self.stft_pad_amount), "constant"
            ).squeeze(1)

        # use dither for inference as well
        if self.dither > 0:
            x += self.dither * torch.randn(
                x.shape, dtype=x.dtype, device=x.device, generator=self.generator
            )

        # do preemphasis
        if self.preemph is not None:
            timemask = torch.arange(x.shape[1], device=x.device).unsqueeze(
                0
            ) < seq_len_time.unsqueeze(1)
            x = torch.cat(
                (x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1
            )
            # Zero out samples beyond each item's true length.
            x = x.masked_fill(~timemask, 0.0)

        x = self.stft(x)

        # torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude
        # guard is needed for sqrt if grads are passed through
        guard = 0 if not self.use_grads else CONSTANT
        x = torch.view_as_real(x)
        x = torch.sqrt(x.pow(2).sum(-1) + guard)

        # get power spectrum
        if self.mag_power != 1.0:
            x = x.pow(self.mag_power)

        # return plain spectrogram if required
        if linear_spec:
            return x, seq_len

        # disable autocast, otherwise it might be automatically casted to fp16
        # on fp16 compatible GPUs and get NaN values for input value of 65520
        with torch.amp.autocast(x.device.type, enabled=False):
            # dot with filterbank energies
            x = torch.matmul(self.fb.to(x.dtype), x)

        # log features if required
        if self.log:
            if self.log_zero_guard_type == "add":
                x = torch.log(x + self.log_zero_guard_value_fn(x))
            elif self.log_zero_guard_type == "clamp":
                x = torch.log(torch.clamp(x, min=self.log_zero_guard_value_fn(x)))
            else:
                raise ValueError("log_zero_guard_type was not understood")

        # frame splicing if required
        if self.frame_splicing > 1:
            x = self.splice_frames(x, self.frame_splicing)

        # normalize if required
        if self.normalize:
            x, _, _ = self.normalize_batch(x, seq_len, normalize_type=self.normalize)

        # mask to zero any values beyond seq_len in batch, pad to multiple of
        # `pad_to` (for efficiency)
        max_len = x.size(-1)
        mask = torch.arange(max_len, device=x.device)
        mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1)
        x = x.masked_fill(
            mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value
        )
        del mask

        pad_to = self.pad_to
        if pad_to == "max":
            x = nn.functional.pad(
                x, (0, self.max_length - x.size(-1)), value=self.pad_value
            )
        elif pad_to > 0:
            pad_amt = x.size(-1) % pad_to
            if pad_amt != 0:
                x = nn.functional.pad(x, (0, pad_to - pad_amt), value=self.pad_value)
        return x, seq_len
class CohereASRFeatureExtractor(SequenceFeatureExtractor):
    """HF-compatible feature extractor wrapping FilterbankFeatures.

    The constructor only records the filterbank configuration; the
    ``FilterbankFeatures`` module itself is built on first access of
    :attr:`filterbank`.
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        feature_size=64,
        sampling_rate=16000,
        padding_value=0.0,
        max_duration=30,
        n_window_size=320,
        n_window_stride=160,
        window="hann",
        normalize="per_feature",
        n_fft=None,
        preemph=0.97,
        lowfreq=0,
        highfreq=None,
        log=True,
        log_zero_guard_type="add",
        log_zero_guard_value=2**-24,
        dither=CONSTANT,
        pad_to=16,
        frame_splicing=1,
        exact_pad=False,
        mag_power=2.0,
        nb_augmentation_prob=0.0,
        nb_max_freq=4000,
        mel_norm="slaney",
        stft_exact_pad=False,
        stft_conv=False,
        device="cpu",
        **kwargs,
    ):
        """Store the settings later passed to ``FilterbankFeatures``."""
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.max_duration = max_duration
        self.hop_length = n_window_stride
        self._device = torch.device(device)
        # Keyword arguments for FilterbankFeatures. Note the renames:
        # sampling_rate -> sample_rate, feature_size -> nfilt and
        # padding_value -> pad_value.
        self._fb_config = {
            "sample_rate": sampling_rate,
            "n_window_size": n_window_size,
            "n_window_stride": n_window_stride,
            "window": window,
            "normalize": normalize,
            "n_fft": n_fft,
            "preemph": preemph,
            "nfilt": feature_size,
            "lowfreq": lowfreq,
            "highfreq": highfreq,
            "log": log,
            "log_zero_guard_type": log_zero_guard_type,
            "log_zero_guard_value": log_zero_guard_value,
            "dither": dither,
            "pad_to": pad_to,
            "max_duration": max_duration,
            "frame_splicing": frame_splicing,
            "exact_pad": exact_pad,
            "pad_value": padding_value,
            "mag_power": mag_power,
            "nb_augmentation_prob": nb_augmentation_prob,
            "nb_max_freq": nb_max_freq,
            "mel_norm": mel_norm,
            "stft_exact_pad": stft_exact_pad,
            "stft_conv": stft_conv,
            "device": device,
        }
        self._filterbank: FilterbankFeatures | None = None

    @property
    def filterbank(self) -> FilterbankFeatures:
        """The filterbank module, created in eval mode on first use."""
        if self._filterbank is not None:
            return self._filterbank
        module = FilterbankFeatures(**self._fb_config)
        module.eval()
        self._filterbank = module.to(self._device)
        return self._filterbank

    def get_seq_len(self, seq_len):
        """Delegate to ``FilterbankFeatures.get_seq_len``."""
        return self.filterbank.get_seq_len(seq_len)

    def __call__(
        self,
        raw_speech,
        sampling_rate=None,
        return_tensors=None,
        **kwargs,
    ) -> BatchFeature:
        """Zero-pad the waveforms into one batch and extract features.

        A single ``np.ndarray`` is treated as a batch of one. The result
        holds ``input_features`` and ``length``, both moved to CPU.
        ``sampling_rate`` and ``**kwargs`` are accepted but not used here.
        """
        samples = [raw_speech] if isinstance(raw_speech, np.ndarray) else raw_speech

        lengths = [wave.shape[0] for wave in samples]
        seq_len = torch.tensor(lengths)

        # Right-pad every waveform with zeros up to the longest one.
        padded = np.zeros((len(samples), max(lengths)), dtype=np.float32)
        for row, (wave, n_samples) in enumerate(zip(samples, lengths)):
            padded[row, :n_samples] = wave

        audio_tensor = torch.from_numpy(padded).to(self._device)
        seq_len = seq_len.to(self._device)

        with torch.no_grad():
            input_features, length = self.filterbank(audio_tensor, seq_len)

        result = BatchFeature(
            {"input_features": input_features.cpu(), "length": length.cpu()}
        )
        if return_tensors is None:
            return result
        return result.convert_to_tensors(return_tensors)
class CohereASRProcessor(ProcessorMixin):
    """HF-compatible processor combining CohereASRFeatureExtractor and a
    tokenizer."""

    feature_extractor_class = "CohereASRFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        """Register the feature extractor and tokenizer with the mixin."""
        super().__init__(feature_extractor, tokenizer)

    def __call__(
        self,
        text=None,
        audio=None,
        sampling_rate=None,
        return_tensors=None,
        **kwargs,
    ):
        """Run the feature extractor on ``audio`` and tokenize ``text``.

        Audio features (if any) form the base result; when ``text`` is given
        only the tokenizer's ``input_ids`` are added to it. ``**kwargs`` are
        forwarded to the tokenizer only.
        """
        if audio is None:
            result = BatchFeature()
        else:
            result = self.feature_extractor(
                audio,
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
            )

        if text is None:
            return result

        tokenized = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        result["input_ids"] = tokenized["input_ids"]
        return result
# Import-time side effect: register the custom classes with the Transformers
# Auto* factories under their class-name keys.
AutoFeatureExtractor.register("CohereASRFeatureExtractor", CohereASRFeatureExtractor)
AutoProcessor.register("CohereASRProcessor", CohereASRProcessor)
......@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
for speech in raw_speech:
"""
We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using librosa.
using kaldiio.load_mat, while vLLM loads audio data using pyav.
"""
speech = speech * 32768
fbank = self.fbank(sampling_rate, speech)
......
......@@ -29,13 +29,8 @@ class GLM4VProcessor(ProcessorMixin):
def __init__(
self,
image_processor: GLM4VImageProcessorFast,
tokenizer: PreTrainedTokenizer,
image_size: int,
image_processor: GLM4VImageProcessorFast | None = None,
) -> None:
self.tokenizer = tokenizer
if image_processor is None:
image_processor = GLM4VImageProcessorFast(
size={"width": image_size, "height": image_size}
)
self.image_processor = image_processor
self.tokenizer = tokenizer
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
import torch
from PIL import Image
from vllm.tokenizers.hf import HfTokenizer
from .internvl import (
InternVLImageProcessor,
InternVLProcessor,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Return the effective ``(min_num, max_num)`` tile counts.

    Without dynamic image sizing both bounds are 1. Otherwise the configured
    bounds are used, and the upper bound grows by one when a thumbnail is
    requested and the upper bound is not 1.
    """
    if not dynamic_image_size:
        # Both bounds collapse to 1, so the thumbnail bump never applies.
        return 1, 1

    upper = max_dynamic_patch
    if use_thumbnail and upper != 1:
        upper += 1
    return min_dynamic_patch, upper
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return the InternVL target ratios, optionally filtered by a prior ratio.

    When ``prior_aspect_ratio`` is given, a candidate is kept only if neither
    component of the prior ratio is divisible by the matching component of
    the candidate.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return candidates

    kept = []
    for ratio in candidates:
        if prior_aspect_ratio[0] % ratio[0] == 0:
            continue
        if prior_aspect_ratio[1] % ratio[1] == 0:
            continue
        kept.append(ratio)
    return kept
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the closest target ratio and derive the tiling geometry.

    Returns ``(blocks, target_width, target_height, target_aspect_ratio)``.
    ``blocks`` is the product of the ratio's two components, plus one when
    ``use_thumbnail`` is set and that product is not 1.
    """
    best_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    cols, rows = best_ratio[0], best_ratio[1]

    num_blocks = cols * rows
    if use_thumbnail and num_blocks != 1:
        # One extra block for the thumbnail image.
        num_blocks += 1

    return num_blocks, image_size * cols, image_size * rows, best_ratio
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Resize ``image`` to the best-fitting grid and cut it into square tiles.

    Returns the tiles in row-major order, followed by an
    ``image_size`` x ``image_size`` thumbnail of the original when
    ``use_thumbnail`` is set and more than one tile was produced, together
    with the chosen aspect ratio.
    """
    src_width, src_height = image.size

    # Tile count here deliberately excludes the thumbnail.
    num_tiles, grid_width, grid_height, chosen_ratio = calculate_h2ovl_targets(
        orig_width=src_width,
        orig_height=src_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((grid_width, grid_height))
    tiles_per_row = grid_width // image_size

    tiles = []
    for index in range(num_tiles):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        # (left, upper, right, lower) crop box of the current tile.
        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, chosen_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Tile ``image`` and stack the transformed tiles into one tensor.

    Returns the stacked tensor (one entry per tile along the first
    dimension) and the aspect ratio selected for the tiling.
    """
    ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)
    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=ratios,
    )

    stacked = torch.stack([to_tensor(tile) for tile in tiles])
    return stacked, chosen_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert ``image`` into a stacked tensor of tile pixel values.

    Without MSAC this is a single tiling pass using the caller's settings.
    With MSAC the image is tiled twice (thumbnail always requested): once
    with ``min_num=1`` and once with ``min_num=3`` filtered by the first
    pass's aspect ratio. ``min_num`` and ``use_thumbnail`` from the caller
    are ignored in that mode.
    """
    if not use_msac:
        single_pass, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return single_pass

    # when MSAC is turned on, we need to process the image twice
    first_pass, first_ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )
    second_pass, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_ratio,
    )

    # Order: second pass without its last entry, first pass without its last
    # entry, then the last entry of the second pass (the thumbnail whenever
    # that pass produced more than one tile).
    pieces = [second_pass[:-1], first_pass[:-1], second_pass[-1:]]
    return torch.cat(pieces, 0)
class H2OVLImageProcessor(InternVLImageProcessor):
    """InternVL image processor variant with an additional ``use_msac`` switch.

    MSAC is only applied when exactly one image is processed in a call.
    """

    def __init__(
        self,
        image_size: int,
        min_dynamic_patch: int,
        max_dynamic_patch: int,
        dynamic_image_size: bool,
        use_thumbnail: bool,
        use_msac: bool,
    ) -> None:
        """Forward the tiling settings to the parent and record ``use_msac``."""
        super().__init__(
            image_size=image_size,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        self.use_msac = use_msac

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve ``(min_num, max_num)``; ``None`` falls back to ``self``."""
        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=(
                self.min_dynamic_patch
                if min_dynamic_patch is None
                else min_dynamic_patch
            ),
            max_dynamic_patch=(
                self.max_dynamic_patch
                if max_dynamic_patch is None
                else max_dynamic_patch
            ),
            dynamic_image_size=(
                self.dynamic_image_size
                if dynamic_image_size is None
                else dynamic_image_size
            ),
            use_thumbnail=(
                self.use_thumbnail if use_thumbnail is None else use_thumbnail
            ),
        )

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Return one pixel-value tensor per input image."""
        # MSAC is disabled as soon as more than one image is supplied.
        msac_enabled = len(images) == 1 and self.use_msac

        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        pixel_values_lst = []
        for img in images:
            pixel_values_lst.append(
                image_to_pixel_values_h2ovl(
                    img,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=msac_enabled,
                )
            )
        return pixel_values_lst
class H2OVLProcessor(InternVLProcessor):
    """InternVL processor variant that works with ``H2OVLImageProcessor``."""

    def __init__(
        self,
        image_processor: H2OVLImageProcessor,
        tokenizer: HfTokenizer,
        *,
        image_seq_length: int,
        start_image_token: str = "<img>",
        end_image_token: str = "</img>",
        ctx_image_token: str = "<IMG_CONTEXT>",
    ) -> None:
        """Forward every argument by keyword to ``InternVLProcessor``."""
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            image_seq_length=image_seq_length,
            start_image_token=start_image_token,
            end_image_token=end_image_token,
            ctx_image_token=ctx_image_token,
        )

        # Annotation only: narrows the attribute type for type checkers.
        self.image_processor: H2OVLImageProcessor

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Compute the candidate tile ratios for the given settings.

        The bounds come from ``image_processor.resolve_min_max_num``; when
        ``override_min_num`` is not ``None`` it replaces the resolved
        minimum before ``get_h2ovl_target_ratios`` is called.

        Returns:
            The value returned by ``get_h2ovl_target_ratios``.
        """
        resolved_min, resolved_max = self.image_processor.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        effective_min = (
            resolved_min if override_min_num is None else override_min_num
        )

        return get_h2ovl_target_ratios(
            effective_min,
            resolved_max,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        Args:
            image_width: Original image width.
            image_height: Original image height.
            use_msac: Overrides the image processor's ``use_msac`` flag
                when not ``None``.

        Returns:
            The patch count multiplied by ``self.image_seq_length``.
        """
        processor = self.image_processor
        msac_enabled = processor.use_msac if use_msac is None else use_msac
        thumbnail_enabled = processor.use_thumbnail

        def count_targets(ratios, with_thumbnail):
            # Shared call shape for every pass; only the ratios and the
            # thumbnail flag differ between passes.
            return calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=processor.image_size,
                target_ratios=ratios,
                use_thumbnail=with_thumbnail,
            )

        if not msac_enabled:
            plain_ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            patch_total, _, _, _ = count_targets(plain_ratios, thumbnail_enabled)
            return patch_total * self.image_seq_length

        # MSAC pass 1: minimum forced to 1.
        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            override_min_num=1,
        )
        first_count, _, _, first_aspect = count_targets(first_ratios, True)

        # MSAC pass 2: minimum forced to 3, seeded with the pass-1 ratio.
        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            prior_aspect_ratio=first_aspect,
            override_min_num=3,
        )
        second_count, _, _, _ = count_targets(second_ratios, True)

        # image_to_pixel_values_h2ovl drops the last entry of the first
        # pass when merging, hence the "- 1".
        patch_total = first_count + second_count - 1
        return patch_total * self.image_seq_length
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment