Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da93439
Commit
0da93439
authored
Mar 26, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori
parents
25f2f756
298e5108
Changes
613
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
1067 additions
and
23 deletions
+1067
-23
vllm/transformers_utils/configs/qwen3_5_moe.py
vllm/transformers_utils/configs/qwen3_5_moe.py
+13
-6
vllm/transformers_utils/configs/qwen3_asr.py
vllm/transformers_utils/configs/qwen3_asr.py
+1
-1
vllm/transformers_utils/configs/qwen3_next.py
vllm/transformers_utils/configs/qwen3_next.py
+9
-2
vllm/transformers_utils/configs/radio.py
vllm/transformers_utils/configs/radio.py
+12
-0
vllm/transformers_utils/configs/speculators/__init__.py
vllm/transformers_utils/configs/speculators/__init__.py
+3
-0
vllm/transformers_utils/configs/speculators/base.py
vllm/transformers_utils/configs/speculators/base.py
+18
-3
vllm/transformers_utils/configs/ultravox.py
vllm/transformers_utils/configs/ultravox.py
+1
-1
vllm/transformers_utils/model_arch_config_convertor.py
vllm/transformers_utils/model_arch_config_convertor.py
+25
-2
vllm/transformers_utils/processors/__init__.py
vllm/transformers_utils/processors/__init__.py
+20
-0
vllm/transformers_utils/processors/cohere_asr.py
vllm/transformers_utils/processors/cohere_asr.py
+575
-0
vllm/transformers_utils/processors/fireredasr2.py
vllm/transformers_utils/processors/fireredasr2.py
+1
-1
vllm/transformers_utils/processors/glm4v.py
vllm/transformers_utils/processors/glm4v.py
+2
-7
vllm/transformers_utils/processors/h2ovl.py
vllm/transformers_utils/processors/h2ovl.py
+387
-0
No files found.
Too many changes to show.
To preserve performance only
613 of 613+
files are displayed.
Plain diff
Email patch
vllm/transformers_utils/configs/qwen3_5_moe.py
View file @
0da93439
...
...
@@ -16,7 +16,7 @@
# limitations under the License.
"""Qwen3.5-MoE model configuration"""
from
transformers.configuration_utils
import
PretrainedConfig
,
layer_type_validation
from
transformers.configuration_utils
import
PretrainedConfig
class
Qwen3_5MoeTextConfig
(
PretrainedConfig
):
...
...
@@ -75,10 +75,6 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
eos_token_id
=
None
,
**
kwargs
,
):
kwargs
[
"ignore_keys_at_rope_validation"
]
=
[
"mrope_section"
,
"mrope_interleaved"
,
]
self
.
vocab_size
=
vocab_size
self
.
max_position_embeddings
=
max_position_embeddings
self
.
hidden_size
=
hidden_size
...
...
@@ -104,7 +100,18 @@ class Qwen3_5MoeTextConfig(PretrainedConfig):
else
"full_attention"
for
i
in
range
(
self
.
num_hidden_layers
)
]
layer_type_validation
(
self
.
layer_types
,
self
.
num_hidden_layers
)
if
hasattr
(
self
,
"validate_layer_type"
):
# Transformers v5
kwargs
[
"ignore_keys_at_rope_validation"
]
=
{
"mrope_section"
,
"mrope_interleaved"
,
}
self
.
validate_layer_type
()
else
:
# Transformers v4
from
transformers.configuration_utils
import
layer_type_validation
layer_type_validation
(
self
.
layer_types
,
self
.
num_hidden_layers
)
# linear attention part
self
.
linear_conv_kernel_dim
=
linear_conv_kernel_dim
...
...
vllm/transformers_utils/configs/qwen3_asr.py
View file @
0da93439
...
...
@@ -408,7 +408,6 @@ class Qwen3ASRConfig(PretrainedConfig):
support_languages
=
None
,
**
kwargs
,
):
super
().
__init__
(
**
kwargs
)
if
thinker_config
is
None
:
thinker_config
=
{}
logger
.
info
(
...
...
@@ -417,6 +416,7 @@ class Qwen3ASRConfig(PretrainedConfig):
self
.
thinker_config
=
Qwen3ASRThinkerConfig
(
**
thinker_config
)
self
.
support_languages
=
support_languages
super
().
__init__
(
**
kwargs
)
def
get_text_config
(
self
,
decoder
=
False
)
->
"PretrainedConfig"
:
"""
...
...
vllm/transformers_utils/configs/qwen3_next.py
View file @
0da93439
...
...
@@ -16,7 +16,7 @@
# limitations under the License.
"""Qwen3-Next model configuration"""
from
transformers.configuration_utils
import
PretrainedConfig
,
layer_type_validation
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
...
...
@@ -253,7 +253,14 @@ class Qwen3NextConfig(PretrainedConfig):
"linear_attention"
if
bool
((
i
+
1
)
%
4
)
else
"full_attention"
for
i
in
range
(
self
.
num_hidden_layers
)
]
layer_type_validation
(
self
.
layer_types
)
if
hasattr
(
self
,
"validate_layer_type"
):
# Transformers v5
self
.
validate_layer_type
()
else
:
# Transformers v4
from
transformers.configuration_utils
import
layer_type_validation
layer_type_validation
(
self
.
layer_types
)
# linear attention part
self
.
linear_conv_kernel_dim
=
linear_conv_kernel_dim
...
...
vllm/transformers_utils/configs/radio.py
View file @
0da93439
...
...
@@ -47,6 +47,14 @@ class RadioConfig(PretrainedConfig):
teachers: A list of teacher model configurations. Each teacher configuration is
a dict with keys like "name" and some may have "use_summary".
cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
video_temporal_patch_size: Number of consecutive video frames grouped into
a single tubelet for temporal compression. Default 1 (no compression).
When > 1, a dedicated video_embedder (3*T*P*P -> hidden) is created
alongside the image embedder (3*P*P -> hidden).
separate_video_embedder: When True and video_temporal_patch_size > 1, use a
dedicated video patch embedder (3*T*P*P -> hidden) separate from the
image embedder (3*P*P -> hidden). When False, a single embedder with
input size 3*T*P*P is used for both (images are duplicated T times).
"""
model_type
=
"radio"
...
...
@@ -68,6 +76,8 @@ class RadioConfig(PretrainedConfig):
register_multiple
:
int
|
None
=
None
,
teachers
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
cls_token_per_teacher
:
bool
=
False
,
video_temporal_patch_size
:
int
=
1
,
separate_video_embedder
:
bool
=
True
,
**
kwargs
,
):
self
.
model_name
=
model_name
...
...
@@ -95,4 +105,6 @@ class RadioConfig(PretrainedConfig):
self
.
register_multiple
=
register_multiple
self
.
teachers
=
teachers
if
teachers
is
not
None
else
[]
self
.
cls_token_per_teacher
=
cls_token_per_teacher
self
.
video_temporal_patch_size
=
video_temporal_patch_size
self
.
separate_video_embedder
=
separate_video_embedder
super
().
__init__
(
**
kwargs
)
vllm/transformers_utils/configs/speculators/__init__.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
.base
import
SpeculatorsConfig
__all__
=
[
"SpeculatorsConfig"
]
vllm/transformers_utils/configs/speculators/base.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
dataclasses
import
fields
,
is_dataclass
from
typing
import
Any
from
transformers
import
PretrainedConfig
...
...
@@ -8,15 +9,29 @@ from transformers import PretrainedConfig
from
vllm.transformers_utils.configs.speculators.algos
import
(
SUPPORTED_SPECULATORS_TYPES
,
)
__all__
=
[
"SpeculatorsConfig"
]
from
vllm.transformers_utils.utils
import
without_trust_remote_code
class
SpeculatorsConfig
(
PretrainedConfig
):
model_type
=
"speculators"
def __init__(self, **kwargs):
    """Initialise the config under both Transformers v4 and v5.

    When ``PretrainedConfig`` is a dataclass (Transformers v5), only the
    keyword arguments that name one of its dataclass fields are forwarded
    to ``super().__init__``; every other key is set directly as an
    attribute beforehand, and ``model_type`` is dropped because it is a
    class variable. Otherwise (Transformers v4) all keyword arguments are
    forwarded unchanged.
    """
    if is_dataclass(PretrainedConfig):
        # Transformers v5 - super().__init__ performs some validation before
        # setting all kwargs as attributes, so we set them first to be safe
        known_fields = {field.name for field in fields(PretrainedConfig)}
        forwarded = {}
        for name, val in kwargs.items():
            if name == "model_type":
                # model_type is set as a class variable, so skip it here
                continue
            if name in known_fields:
                forwarded[name] = val
            else:
                setattr(self, name, val)
        super().__init__(**forwarded)
        return

    # Transformers v4 - super().__init__ which sets all kwargs as attributes
    super().__init__(**kwargs)
@
classmethod
def
from_pretrained
(
cls
,
...
...
vllm/transformers_utils/configs/ultravox.py
View file @
0da93439
...
...
@@ -43,7 +43,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
use `False`, but v0.5 and above use `True`.
"""
wrapped_model_config
:
transformers
.
PretrainedConfig
model_type
=
"ultravox"
audio_token
=
"<|audio|>"
is_composition
=
False
...
...
@@ -75,6 +74,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
self
.
num_projector_layers
=
num_projector_layers
# N.B. May set the wrapped_model_config below.
self
.
wrapped_model_config
:
transformers
.
PretrainedConfig
self
.
text_model_id
=
text_model_id
if
text_model_id
is
None
:
text_config
=
text_config
or
{}
...
...
vllm/transformers_utils/model_arch_config_convertor.py
View file @
0da93439
...
...
@@ -228,7 +228,7 @@ class ModelArchConfigConvertorBase:
"pangu_ultra_moe_mtp"
,
"bailing_hybrid"
,
):
return
self
.
hf_text_config
.
kv_lora_rank
is
not
None
return
getattr
(
self
.
hf_text_config
,
"
kv_lora_rank
"
,
None
)
is
not
None
elif
self
.
hf_text_config
.
model_type
==
"eagle"
:
# if the model is an EAGLE module, check for the
# underlying architecture
...
...
@@ -241,7 +241,7 @@ class ModelArchConfigConvertorBase:
"deepseek_v32"
,
"deepseek_mtp"
,
)
and
self
.
hf_text_config
.
kv_lora_rank
is
not
None
and
getattr
(
self
.
hf_text_config
,
"
kv_lora_rank
"
,
None
)
is
not
None
)
return
False
...
...
@@ -300,6 +300,28 @@ class ModelArchConfigConvertorBase:
return
model_arch_config
class CohereAsrModelArchConfigConvertor(ModelArchConfigConvertorBase):
    """Architecture convertor for ``cohere_asr`` configs.

    Head counts and hidden size are read from
    ``hf_text_config.transf_decoder["config_dict"]`` and
    ``hf_text_config.encoder`` rather than from top-level attributes.
    """

    def get_total_num_attention_heads(self) -> int:
        """Return the decoder's ``num_attention_heads``."""
        decoder_cfg = self.hf_text_config.transf_decoder["config_dict"]
        return decoder_cfg["num_attention_heads"]

    def get_head_size(self) -> int:
        """Return decoder ``hidden_size`` floor-divided by its head count."""
        hidden = self.hf_text_config.transf_decoder["config_dict"]["hidden_size"]
        heads = self.hf_text_config.transf_decoder["config_dict"][
            "num_attention_heads"
        ]
        return hidden // heads

    def get_total_num_kv_heads(self) -> int:
        """Return the encoder head count after checking it equals the decoder's."""
        encoder_heads = self.hf_text_config.encoder["n_heads"]
        decoder_heads = self.hf_text_config.transf_decoder["config_dict"][
            "num_attention_heads"
        ]
        assert encoder_heads == decoder_heads, (
            "Encoder and decoder must have the same number of kv heads"
        )
        return encoder_heads
class
MambaModelArchConfigConvertor
(
ModelArchConfigConvertorBase
):
def
get_head_size
(
self
)
->
int
:
return
0
...
...
@@ -425,6 +447,7 @@ class LongCatFlashMTPModelArchConfigConvertor(ModelArchConfigConvertorBase):
# hf_config.model_type -> convertor class
MODEL_ARCH_CONFIG_CONVERTORS
=
{
"cohere_asr"
:
CohereAsrModelArchConfigConvertor
,
"mamba"
:
MambaModelArchConfigConvertor
,
"falcon_mamba"
:
MambaModelArchConfigConvertor
,
"timm_wrapper"
:
TerratorchModelArchConfigConvertor
,
...
...
vllm/transformers_utils/processors/__init__.py
View file @
0da93439
...
...
@@ -12,36 +12,56 @@ import importlib
__all__
=
[
"BagelProcessor"
,
"CohereASRProcessor"
,
"DeepseekVLV2Processor"
,
"FireRedASR2Processor"
,
"FunASRProcessor"
,
"GLM4VProcessor"
,
"H2OVLProcessor"
,
"HunYuanVLProcessor"
,
"HunYuanVLImageProcessor"
,
"InternVLProcessor"
,
"IsaacProcessor"
,
"KimiAudioProcessor"
,
"KimiK25Processor"
,
"MistralCommonPixtralProcessor"
,
"MistralCommonVoxtralProcessor"
,
"NanoNemotronVLProcessor"
,
"NemotronVLProcessor"
,
"LlamaNemotronVLEmbedProcessor"
,
"NVLMProcessor"
,
"OvisProcessor"
,
"Ovis2_5Processor"
,
"QwenVLProcessor"
,
"Qwen3ASRProcessor"
,
"Step3VLProcessor"
,
]
_CLASS_TO_MODULE
:
dict
[
str
,
str
]
=
{
"BagelProcessor"
:
"vllm.transformers_utils.processors.bagel"
,
"CohereASRProcessor"
:
"vllm.transformers_utils.processors.cohere_asr"
,
"DeepseekVLV2Processor"
:
"vllm.transformers_utils.processors.deepseek_vl2"
,
"FireRedASR2Processor"
:
"vllm.transformers_utils.processors.fireredasr2"
,
"FunASRProcessor"
:
"vllm.transformers_utils.processors.funasr"
,
"GLM4VProcessor"
:
"vllm.transformers_utils.processors.glm4v"
,
"H2OVLProcessor"
:
"vllm.transformers_utils.processors.h2ovl"
,
"HunYuanVLProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl"
,
"HunYuanVLImageProcessor"
:
"vllm.transformers_utils.processors.hunyuan_vl_image"
,
"InternVLProcessor"
:
"vllm.transformers_utils.processors.internvl"
,
"IsaacProcessor"
:
"vllm.transformers_utils.processors.isaac"
,
"KimiAudioProcessor"
:
"vllm.transformers_utils.processors.kimi_audio"
,
"KimiK25Processor"
:
"vllm.transformers_utils.processors.kimi_k25"
,
"MistralCommonPixtralProcessor"
:
"vllm.transformers_utils.processors.pixtral"
,
"MistralCommonVoxtralProcessor"
:
"vllm.transformers_utils.processors.voxtral"
,
"NanoNemotronVLProcessor"
:
"vllm.transformers_utils.processors.nano_nemotron_vl"
,
"NemotronVLProcessor"
:
"vllm.transformers_utils.processors.nemotron_vl"
,
"LlamaNemotronVLEmbedProcessor"
:
"vllm.transformers_utils.processors.nemotron_vl"
,
"NVLMProcessor"
:
"vllm.transformers_utils.processors.nvlm_d"
,
"OvisProcessor"
:
"vllm.transformers_utils.processors.ovis"
,
"Ovis2_5Processor"
:
"vllm.transformers_utils.processors.ovis2_5"
,
"QwenVLProcessor"
:
"vllm.transformers_utils.processors.qwen_vl"
,
"Qwen3ASRProcessor"
:
"vllm.transformers_utils.processors.qwen3_asr"
,
"Step3VLProcessor"
:
"vllm.transformers_utils.processors.step3_vl"
,
}
...
...
vllm/transformers_utils/processors/cohere_asr.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
logging
import
math
import
random
import
librosa
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
transformers
import
AutoFeatureExtractor
,
AutoProcessor
,
BatchFeature
from
transformers.feature_extraction_sequence_utils
import
(
SequenceFeatureExtractor
,
)
from
transformers.processing_utils
import
ProcessorMixin
logger
=
logging
.
getLogger
(
__name__
)
CONSTANT
=
1e-5
INF_VAL
=
10000.0
class
FilterbankFeatures
(
nn
.
Module
):
"""Featurizer that converts wavs to Mel Spectrograms.
See AudioToMelSpectrogramPreprocessor for args.
"""
window
:
torch
.
Tensor
fb
:
torch
.
Tensor
def
__init__
(
self
,
sample_rate
=
16000
,
n_window_size
=
320
,
n_window_stride
=
160
,
window
=
"hann"
,
normalize
=
"per_feature"
,
n_fft
=
None
,
preemph
=
0.97
,
nfilt
=
64
,
lowfreq
=
0
,
highfreq
=
None
,
log
=
True
,
log_zero_guard_type
=
"add"
,
log_zero_guard_value
=
2
**-
24
,
dither
=
CONSTANT
,
pad_to
=
16
,
max_duration
=
30
,
frame_splicing
=
1
,
exact_pad
=
False
,
pad_value
=
0
,
mag_power
=
2.0
,
use_grads
=
False
,
rng
=
None
,
nb_augmentation_prob
=
0.0
,
nb_max_freq
=
4000
,
mel_norm
=
"slaney"
,
stft_exact_pad
=
False
,
stft_conv
=
False
,
device
=
"cpu"
,
):
super
().
__init__
()
if
stft_conv
or
stft_exact_pad
:
logger
.
warning
(
"Using torch_stft is deprecated and has been removed. "
"The values have been forcibly set to False for "
"FilterbankFeatures and AudioToMelSpectrogramPreprocessor. "
"Please set exact_pad to True as needed."
)
if
exact_pad
and
n_window_stride
%
2
==
1
:
raise
NotImplementedError
(
f
"
{
self
}
received exact_pad == True, but hop_size was odd. "
"If audio_length % hop_size == 0, the returned spectrogram "
"would not be of length audio_length // hop_size. "
"Please use an even hop_size."
)
self
.
log_zero_guard_value
=
log_zero_guard_value
if
(
n_window_size
is
None
or
n_window_stride
is
None
or
not
isinstance
(
n_window_size
,
int
)
or
not
isinstance
(
n_window_stride
,
int
)
or
n_window_size
<=
0
or
n_window_stride
<=
0
):
raise
ValueError
(
f
"
{
self
}
got an invalid value for either n_window_size or "
f
"n_window_stride. Both must be positive ints."
)
self
.
sample_rate
=
sample_rate
self
.
win_length
=
n_window_size
self
.
hop_length
=
n_window_stride
self
.
n_fft
=
n_fft
or
2
**
math
.
ceil
(
math
.
log2
(
self
.
win_length
))
self
.
stft_pad_amount
=
(
(
self
.
n_fft
-
self
.
hop_length
)
//
2
if
exact_pad
else
None
)
self
.
exact_pad
=
exact_pad
self
.
sample_rate
=
sample_rate
self
.
max_duration
=
max_duration
if
exact_pad
:
logger
.
info
(
"STFT using exact pad"
)
torch_windows
=
{
"hann"
:
torch
.
hann_window
,
"hamming"
:
torch
.
hamming_window
,
"blackman"
:
torch
.
blackman_window
,
"bartlett"
:
torch
.
bartlett_window
,
"none"
:
None
,
}
window_fn
=
torch_windows
.
get
(
window
)
window_tensor
=
(
window_fn
(
self
.
win_length
,
periodic
=
False
)
if
window_fn
else
None
)
self
.
register_buffer
(
"window"
,
window_tensor
)
self
.
normalize
=
normalize
self
.
log
=
log
self
.
dither
=
dither
self
.
frame_splicing
=
frame_splicing
self
.
nfilt
=
nfilt
self
.
preemph
=
preemph
self
.
pad_to
=
pad_to
highfreq
=
highfreq
or
sample_rate
/
2
self
.
sample_rate
=
sample_rate
# disable pad min duration
# self.pad_min_duration = 1.0
self
.
pad_min_duration
=
0.0
self
.
pad_direction
=
"both"
filterbanks
=
torch
.
tensor
(
librosa
.
filters
.
mel
(
sr
=
sample_rate
,
n_fft
=
self
.
n_fft
,
n_mels
=
nfilt
,
fmin
=
lowfreq
,
fmax
=
highfreq
,
norm
=
mel_norm
,
),
dtype
=
torch
.
float
,
).
unsqueeze
(
0
)
self
.
register_buffer
(
"fb"
,
filterbanks
)
# Calculate maximum sequence length
max_length
=
self
.
get_seq_len
(
torch
.
tensor
(
max_duration
*
sample_rate
,
dtype
=
torch
.
float
)
)
max_pad
=
pad_to
-
(
max_length
%
pad_to
)
if
pad_to
>
0
else
0
self
.
max_length
=
max_length
+
max_pad
self
.
pad_value
=
pad_value
self
.
mag_power
=
mag_power
# We want to avoid taking the log of zero
# There are two options: either adding or clamping to a small value
if
log_zero_guard_type
not
in
[
"add"
,
"clamp"
]:
raise
ValueError
(
f
"
{
self
}
received
{
log_zero_guard_type
}
for the "
f
"log_zero_guard_type parameter. It must be either 'add' or "
f
"'clamp'."
)
self
.
use_grads
=
use_grads
if
not
use_grads
:
self
.
forward
=
torch
.
no_grad
()(
self
.
forward
)
self
.
_rng
=
random
.
Random
()
if
rng
is
None
else
rng
self
.
nb_augmentation_prob
=
nb_augmentation_prob
if
self
.
nb_augmentation_prob
>
0.0
:
if
nb_max_freq
>=
sample_rate
/
2
:
self
.
nb_augmentation_prob
=
0.0
else
:
self
.
_nb_max_fft_bin
=
int
((
nb_max_freq
/
sample_rate
)
*
n_fft
)
# log_zero_guard_value is the the small we want to use, we support
# an actual number, or "tiny", or "eps"
self
.
log_zero_guard_type
=
log_zero_guard_type
assert
self
.
window
is
not
None
assert
self
.
fb
is
not
None
self
.
window
=
self
.
window
.
to
(
dtype
=
torch
.
bfloat16
)
self
.
fb
=
self
.
fb
.
to
(
dtype
=
torch
.
bfloat16
)
self
.
generator
=
torch
.
Generator
(
device
=
device
)
self
.
generator
.
manual_seed
(
0
)
@
torch
.
_dynamo
.
disable
def
stft
(
self
,
x
):
# disable autocast to get full range of stft values
with
torch
.
amp
.
autocast
(
x
.
device
.
type
,
enabled
=
False
):
return
torch
.
stft
(
x
,
n_fft
=
self
.
n_fft
,
hop_length
=
self
.
hop_length
,
win_length
=
self
.
win_length
,
center
=
not
self
.
exact_pad
,
window
=
self
.
window
.
to
(
dtype
=
torch
.
float
,
device
=
x
.
device
),
return_complex
=
True
,
pad_mode
=
"constant"
,
)
def
log_zero_guard_value_fn
(
self
,
x
):
if
isinstance
(
self
.
log_zero_guard_value
,
str
):
if
self
.
log_zero_guard_value
==
"tiny"
:
return
torch
.
finfo
(
x
.
dtype
).
tiny
elif
self
.
log_zero_guard_value
==
"eps"
:
return
torch
.
finfo
(
x
.
dtype
).
eps
else
:
raise
ValueError
(
f
"
{
self
}
received
{
self
.
log_zero_guard_value
}
for the "
f
"log_zero_guard_type parameter. It must be either a "
f
"number, 'tiny', or 'eps'"
)
else
:
return
self
.
log_zero_guard_value
def
get_seq_len
(
self
,
seq_len
):
# Assuming that center is True is stft_pad_amount = 0
pad_amount
=
(
self
.
stft_pad_amount
*
2
if
self
.
stft_pad_amount
is
not
None
else
self
.
n_fft
//
2
*
2
)
seq_len
=
torch
.
floor_divide
(
(
seq_len
+
pad_amount
-
self
.
n_fft
),
self
.
hop_length
)
return
seq_len
.
to
(
dtype
=
torch
.
long
)
@
property
def
filter_banks
(
self
):
return
self
.
fb
def
splice_frames
(
self
,
x
,
frame_splicing
):
"""Stacks frames together across feature dim
input is batch_size, feature_dim, num_frames
output is batch_size, feature_dim*frame_splicing, num_frames
"""
seq
=
[
x
]
for
n
in
range
(
1
,
frame_splicing
):
seq
.
append
(
torch
.
cat
([
x
[:,
:,
:
n
],
x
[:,
:,
n
:]],
dim
=
2
))
return
torch
.
cat
(
seq
,
dim
=
1
)
def
normalize_batch
(
self
,
x
,
seq_len
,
normalize_type
):
x_mean
=
None
x_std
=
None
if
normalize_type
==
"per_feature"
:
batch_size
=
x
.
shape
[
0
]
max_time
=
x
.
shape
[
2
]
# When doing stream capture to a graph, item() is not allowed
# because it calls cudaStreamSynchronize(). Therefore, we are
# sacrificing some error checking when running with cuda graphs.
# if (
# torch.cuda.is_available()
# and not torch.cuda.is_current_stream_capturing()
# and torch.any(seq_len == 1).item()
# ):
# raise ValueError(
# "normalize_batch with `per_feature` normalize_type "
# "received a tensor of length 1. This will result in "
# "torch.std() returning nan. Make sure your audio length "
# "has enough samples for a single feature (ex. at least "
# "`hop_length` for Mel Spectrograms)."
# )
time_steps
=
(
torch
.
arange
(
max_time
,
device
=
x
.
device
)
.
unsqueeze
(
0
)
.
expand
(
batch_size
,
max_time
)
)
valid_mask
=
time_steps
<
seq_len
.
unsqueeze
(
1
)
x_mean_numerator
=
torch
.
where
(
valid_mask
.
unsqueeze
(
1
),
x
,
0.0
).
sum
(
axis
=
2
)
x_mean_denominator
=
valid_mask
.
sum
(
axis
=
1
)
x_mean
=
x_mean_numerator
/
x_mean_denominator
.
unsqueeze
(
1
)
# Subtract 1 in the denominator to correct for the bias.
x_std
=
torch
.
sqrt
(
torch
.
sum
(
torch
.
where
(
valid_mask
.
unsqueeze
(
1
),
x
-
x_mean
.
unsqueeze
(
2
),
0.0
)
**
2
,
axis
=
2
,
)
/
(
x_mean_denominator
.
unsqueeze
(
1
)
-
1.0
)
)
x_std
=
x_std
.
masked_fill
(
x_std
.
isnan
(),
0.0
)
# edge case: only 1 frame in denominator
# make sure x_std is not zero
x_std
+=
CONSTANT
return
(
x
-
x_mean
.
unsqueeze
(
2
))
/
x_std
.
unsqueeze
(
2
),
x_mean
,
x_std
elif
normalize_type
==
"all_features"
:
x_mean
=
torch
.
zeros
(
seq_len
.
shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
)
x_std
=
torch
.
zeros
(
seq_len
.
shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
)
for
i
in
range
(
x
.
shape
[
0
]):
x_mean
[
i
]
=
x
[
i
,
:,
:
seq_len
[
i
].
item
()].
mean
()
x_std
[
i
]
=
x
[
i
,
:,
:
seq_len
[
i
].
item
()].
std
()
# make sure x_std is not zero
x_std
+=
CONSTANT
return
(
x
-
x_mean
.
view
(
-
1
,
1
,
1
))
/
x_std
.
view
(
-
1
,
1
,
1
),
x_mean
,
x_std
elif
"fixed_mean"
in
normalize_type
and
"fixed_std"
in
normalize_type
:
x_mean
=
torch
.
tensor
(
normalize_type
[
"fixed_mean"
],
device
=
x
.
device
)
x_std
=
torch
.
tensor
(
normalize_type
[
"fixed_std"
],
device
=
x
.
device
)
return
(
(
x
-
x_mean
.
view
(
x
.
shape
[
0
],
x
.
shape
[
1
]).
unsqueeze
(
2
))
/
x_std
.
view
(
x
.
shape
[
0
],
x
.
shape
[
1
]).
unsqueeze
(
2
),
x_mean
,
x_std
,
)
else
:
return
x
,
x_mean
,
x_std
@
torch
.
compile
def
forward
(
self
,
x
,
seq_len
,
linear_spec
=
False
):
if
x
.
shape
[
1
]
<
self
.
sample_rate
*
self
.
pad_min_duration
:
pad_amount
=
int
(
self
.
sample_rate
*
self
.
pad_min_duration
)
-
x
.
shape
[
1
]
if
self
.
pad_direction
==
"right"
:
x
=
F
.
pad
(
x
,
(
0
,
pad_amount
),
value
=
self
.
pad_value
)
elif
self
.
pad_direction
==
"left"
:
x
=
F
.
pad
(
x
,
(
pad_amount
,
0
),
value
=
self
.
pad_value
)
elif
self
.
pad_direction
==
"both"
:
left_pad
=
pad_amount
//
2
right_pad
=
pad_amount
-
left_pad
x
=
F
.
pad
(
x
,
(
left_pad
,
right_pad
),
value
=
self
.
pad_value
)
else
:
raise
ValueError
(
f
"
{
self
}
received an invalid pad_direction:
{
self
.
pad_direction
}
. "
f
"It must be one of 'left', 'right', or 'both'."
)
seq_len
=
torch
.
tensor
([
x
.
shape
[
1
]],
dtype
=
torch
.
float
,
device
=
x
.
device
)
seq_len_time
=
seq_len
seq_len_unfixed
=
self
.
get_seq_len
(
seq_len
)
# fix for seq_len = 0 for streaming; if size was 0, it is always padded
# to 1, and normalizer fails
seq_len
=
torch
.
where
(
seq_len
==
0
,
torch
.
zeros_like
(
seq_len_unfixed
),
seq_len_unfixed
)
if
self
.
stft_pad_amount
is
not
None
:
x
=
torch
.
nn
.
functional
.
pad
(
x
.
unsqueeze
(
1
),
(
self
.
stft_pad_amount
,
self
.
stft_pad_amount
),
"constant"
).
squeeze
(
1
)
# use dither for inference as well
if
self
.
dither
>
0
:
x
+=
self
.
dither
*
torch
.
randn
(
x
.
shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
,
generator
=
self
.
generator
)
# do preemphasis
if
self
.
preemph
is
not
None
:
timemask
=
torch
.
arange
(
x
.
shape
[
1
],
device
=
x
.
device
).
unsqueeze
(
0
)
<
seq_len_time
.
unsqueeze
(
1
)
x
=
torch
.
cat
(
(
x
[:,
0
].
unsqueeze
(
1
),
x
[:,
1
:]
-
self
.
preemph
*
x
[:,
:
-
1
]),
dim
=
1
)
x
=
x
.
masked_fill
(
~
timemask
,
0.0
)
x
=
self
.
stft
(
x
)
# torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude
# guard is needed for sqrt if grads are passed through
guard
=
0
if
not
self
.
use_grads
else
CONSTANT
x
=
torch
.
view_as_real
(
x
)
x
=
torch
.
sqrt
(
x
.
pow
(
2
).
sum
(
-
1
)
+
guard
)
# get power spectrum
if
self
.
mag_power
!=
1.0
:
x
=
x
.
pow
(
self
.
mag_power
)
# return plain spectrogram if required
if
linear_spec
:
return
x
,
seq_len
# disable autocast, otherwise it might be automatically casted to fp16
# on fp16 compatible GPUs and get NaN values for input value of 65520
with
torch
.
amp
.
autocast
(
x
.
device
.
type
,
enabled
=
False
):
# dot with filterbank energies
x
=
torch
.
matmul
(
self
.
fb
.
to
(
x
.
dtype
),
x
)
# log features if required
if
self
.
log
:
if
self
.
log_zero_guard_type
==
"add"
:
x
=
torch
.
log
(
x
+
self
.
log_zero_guard_value_fn
(
x
))
elif
self
.
log_zero_guard_type
==
"clamp"
:
x
=
torch
.
log
(
torch
.
clamp
(
x
,
min
=
self
.
log_zero_guard_value_fn
(
x
)))
else
:
raise
ValueError
(
"log_zero_guard_type was not understood"
)
# frame splicing if required
if
self
.
frame_splicing
>
1
:
x
=
self
.
splice_frames
(
x
,
self
.
frame_splicing
)
# normalize if required
if
self
.
normalize
:
x
,
_
,
_
=
self
.
normalize_batch
(
x
,
seq_len
,
normalize_type
=
self
.
normalize
)
# mask to zero any values beyond seq_len in batch, pad to multiple of
# `pad_to` (for efficiency)
max_len
=
x
.
size
(
-
1
)
mask
=
torch
.
arange
(
max_len
,
device
=
x
.
device
)
mask
=
mask
.
repeat
(
x
.
size
(
0
),
1
)
>=
seq_len
.
unsqueeze
(
1
)
x
=
x
.
masked_fill
(
mask
.
unsqueeze
(
1
).
type
(
torch
.
bool
).
to
(
device
=
x
.
device
),
self
.
pad_value
)
del
mask
pad_to
=
self
.
pad_to
if
pad_to
==
"max"
:
x
=
nn
.
functional
.
pad
(
x
,
(
0
,
self
.
max_length
-
x
.
size
(
-
1
)),
value
=
self
.
pad_value
)
elif
pad_to
>
0
:
pad_amt
=
x
.
size
(
-
1
)
%
pad_to
if
pad_amt
!=
0
:
x
=
nn
.
functional
.
pad
(
x
,
(
0
,
pad_to
-
pad_amt
),
value
=
self
.
pad_value
)
return
x
,
seq_len
class CohereASRFeatureExtractor(SequenceFeatureExtractor):
    """HF-compatible feature extractor wrapping FilterbankFeatures.

    The constructor only records the filterbank configuration; the
    ``FilterbankFeatures`` module itself is built lazily on first access of
    ``filterbank`` and moved to the configured device.
    """

    model_input_names = ["input_features"]

    def __init__(
        self,
        feature_size=64,
        sampling_rate=16000,
        padding_value=0.0,
        max_duration=30,
        n_window_size=320,
        n_window_stride=160,
        window="hann",
        normalize="per_feature",
        n_fft=None,
        preemph=0.97,
        lowfreq=0,
        highfreq=None,
        log=True,
        log_zero_guard_type="add",
        log_zero_guard_value=2**-24,
        dither=CONSTANT,
        pad_to=16,
        frame_splicing=1,
        exact_pad=False,
        mag_power=2.0,
        nb_augmentation_prob=0.0,
        nb_max_freq=4000,
        mel_norm="slaney",
        stft_exact_pad=False,
        stft_conv=False,
        device="cpu",
        **kwargs,
    ):
        """Store the settings later passed to ``FilterbankFeatures``.

        ``feature_size`` is forwarded as ``nfilt``, ``sampling_rate`` as
        ``sample_rate`` and ``padding_value`` as ``pad_value``; the remaining
        named arguments are forwarded under their own names. Extra
        ``kwargs`` go to ``SequenceFeatureExtractor.__init__``.
        """
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.max_duration = max_duration
        self.hop_length = n_window_stride
        self._device = torch.device(device)
        self._fb_config = dict(
            sample_rate=sampling_rate,
            n_window_size=n_window_size,
            n_window_stride=n_window_stride,
            window=window,
            normalize=normalize,
            n_fft=n_fft,
            preemph=preemph,
            nfilt=feature_size,
            lowfreq=lowfreq,
            highfreq=highfreq,
            log=log,
            log_zero_guard_type=log_zero_guard_type,
            log_zero_guard_value=log_zero_guard_value,
            dither=dither,
            pad_to=pad_to,
            max_duration=max_duration,
            frame_splicing=frame_splicing,
            exact_pad=exact_pad,
            pad_value=padding_value,
            mag_power=mag_power,
            nb_augmentation_prob=nb_augmentation_prob,
            nb_max_freq=nb_max_freq,
            mel_norm=mel_norm,
            stft_exact_pad=stft_exact_pad,
            stft_conv=stft_conv,
            device=device,
        )
        # Built on first use by the `filterbank` property.
        self._filterbank: FilterbankFeatures | None = None

    @property
    def filterbank(self) -> FilterbankFeatures:
        """The lazily constructed ``FilterbankFeatures`` module in eval mode."""
        if self._filterbank is None:
            fb = FilterbankFeatures(**self._fb_config)
            fb.eval()
            self._filterbank = fb.to(self._device)
        return self._filterbank

    def get_seq_len(self, seq_len):
        """Delegate to ``FilterbankFeatures.get_seq_len``."""
        return self.filterbank.get_seq_len(seq_len)

    def __call__(
        self,
        raw_speech,
        sampling_rate=None,
        return_tensors=None,
        **kwargs,
    ) -> BatchFeature:
        """Zero-pad a batch of 1-D waveforms and extract filterbank features.

        Args:
            raw_speech: a single ``np.ndarray`` waveform, or a sequence of
                1-D waveforms (arrays or anything ``np.asarray`` accepts).
            sampling_rate: accepted for API compatibility; not used here.
            return_tensors: forwarded to ``BatchFeature.convert_to_tensors``
                when not None.
            **kwargs: accepted for API compatibility; not used here.

        Returns:
            ``BatchFeature`` with ``input_features`` and ``length`` (both
            moved to CPU).

        Raises:
            ValueError: ``raw_speech`` is empty or an item is not 1-D.
        """
        if isinstance(raw_speech, np.ndarray):
            raw_speech = [raw_speech]

        # Normalise every item to a float32 array so plain Python lists are
        # accepted as well as ndarrays.
        waveforms = [np.asarray(s, dtype=np.float32) for s in raw_speech]
        if not waveforms:
            raise ValueError("raw_speech must contain at least one waveform.")
        for wave in waveforms:
            if wave.ndim != 1:
                raise ValueError(
                    "Each waveform in raw_speech must be 1-D, got an array "
                    f"with shape {wave.shape}."
                )

        lengths = [wave.shape[0] for wave in waveforms]
        padded = np.zeros((len(waveforms), max(lengths)), dtype=np.float32)
        for row, (wave, n_samples) in enumerate(zip(waveforms, lengths)):
            padded[row, :n_samples] = wave

        audio_tensor = torch.from_numpy(padded).to(self._device)
        seq_len = torch.tensor(lengths).to(self._device)

        with torch.no_grad():
            input_features, length = self.filterbank(audio_tensor, seq_len)

        result = BatchFeature(
            {"input_features": input_features.cpu(), "length": length.cpu()}
        )

        if return_tensors is not None:
            result = result.convert_to_tensors(return_tensors)

        return result
class CohereASRProcessor(ProcessorMixin):
    """HF-compatible processor combining CohereASRFeatureExtractor and a
    tokenizer."""

    feature_extractor_class = "CohereASRFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)

    def __call__(
        self,
        text=None,
        audio=None,
        sampling_rate=None,
        return_tensors=None,
        **kwargs,
    ):
        """Featurise ``audio`` and/or tokenise ``text`` into one BatchFeature.

        Audio, when given, is passed to the feature extractor; otherwise an
        empty ``BatchFeature`` is used. Text, when given, is tokenised with
        ``kwargs`` forwarded to the tokenizer, and only its ``input_ids``
        are added to the result.
        """
        if audio is None:
            outputs = BatchFeature()
        else:
            outputs = self.feature_extractor(
                audio,
                sampling_rate=sampling_rate,
                return_tensors=return_tensors,
            )

        if text is None:
            return outputs

        tokenized = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        outputs["input_ids"] = tokenized["input_ids"]
        return outputs
# Register the custom feature extractor and processor with the transformers
# Auto* factories as an import-time side effect of this module.
AutoFeatureExtractor.register("CohereASRFeatureExtractor", CohereASRFeatureExtractor)
AutoProcessor.register("CohereASRProcessor", CohereASRProcessor)
vllm/transformers_utils/processors/fireredasr2.py
View file @
0da93439
...
...
@@ -188,7 +188,7 @@ class FireRedASR2FeatureExtractor(SequenceFeatureExtractor):
for
speech
in
raw_speech
:
"""
We must multiply by 32768 here because FireRedASR2 loads audio data
using kaldiio.load_mat, while vLLM loads audio data using
librosa
.
using kaldiio.load_mat, while vLLM loads audio data using
pyav
.
"""
speech
=
speech
*
32768
fbank
=
self
.
fbank
(
sampling_rate
,
speech
)
...
...
vllm/transformers_utils/processors/glm4v.py
View file @
0da93439
...
...
@@ -29,13 +29,8 @@ class GLM4VProcessor(ProcessorMixin):
def
__init__
(
self
,
image_processor
:
GLM4VImageProcessorFast
,
tokenizer
:
PreTrainedTokenizer
,
image_size
:
int
,
image_processor
:
GLM4VImageProcessorFast
|
None
=
None
,
)
->
None
:
self
.
tokenizer
=
tokenizer
if
image_processor
is
None
:
image_processor
=
GLM4VImageProcessorFast
(
size
=
{
"width"
:
image_size
,
"height"
:
image_size
}
)
self
.
image_processor
=
image_processor
self
.
tokenizer
=
tokenizer
vllm/transformers_utils/processors/h2ovl.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
# --------------------------------------------------------
# H2OVL-Mississippi
# Copyright (c) 2024 H2O.AI
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
import
torch
from
PIL
import
Image
from
vllm.tokenizers.hf
import
HfTokenizer
from
.internvl
import
(
InternVLImageProcessor
,
InternVLProcessor
,
build_transform
,
find_closest_aspect_ratio
,
get_internvl_target_ratios
,
)
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Return the effective ``(min, max)`` number of patches per image.

    Args:
        min_dynamic_patch: Configured lower bound on the patch count.
        max_dynamic_patch: Configured upper bound on the patch count.
        dynamic_image_size: When false, both bounds collapse to ``1``.
        use_thumbnail: When true and the upper bound is not ``1``, the upper
            bound is raised by one.

    Returns:
        The resolved ``(min_num, max_num)`` pair.
    """
    if not dynamic_image_size:
        # Both bounds are 1, and an upper bound of 1 never gets a thumbnail.
        return 1, 1

    upper = max_dynamic_patch
    if use_thumbnail and upper != 1:
        upper += 1

    return min_dynamic_patch, upper
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Return the candidate tile-grid ratios, optionally filtered.

    Args:
        min_num: Passed to ``get_internvl_target_ratios``.
        max_num: Passed to ``get_internvl_target_ratios``.
        prior_aspect_ratio: Ratio chosen by an earlier pass, or ``None`` to
            skip filtering.

    Returns:
        All InternVL target ratios when ``prior_aspect_ratio`` is ``None``.
        Otherwise only those ratios for which neither component evenly
        divides the matching component of ``prior_aspect_ratio``.
    """
    candidates = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return candidates

    kept = []
    for ratio in candidates:
        # Drop the ratio as soon as either component divides the prior one.
        if prior_aspect_ratio[0] % ratio[0] == 0:
            continue
        if prior_aspect_ratio[1] % ratio[1] == 0:
            continue
        kept.append(ratio)
    return kept
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the closest tile grid for an image and derive its target size.

    Args:
        orig_width: Width of the original image.
        orig_height: Height of the original image.
        target_ratios: Candidate grids given to ``find_closest_aspect_ratio``.
        image_size: Side length of one tile.
        use_thumbnail: When true, one extra block is counted unless the grid
            consists of a single block.

    Returns:
        ``(blocks, target_width, target_height, target_aspect_ratio)``.
    """
    chosen_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )
    width_factor = chosen_ratio[0]
    height_factor = chosen_ratio[1]

    num_blocks = width_factor * height_factor
    if use_thumbnail and num_blocks != 1:
        # The thumbnail is only counted for multi-block grids.
        num_blocks += 1

    return (
        num_blocks,
        image_size * width_factor,
        image_size * height_factor,
        chosen_ratio,
    )
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Resize ``image`` to the chosen tile grid and cut it into square tiles.

    Args:
        image: Source image.
        target_ratios: Candidate grids passed to ``calculate_h2ovl_targets``.
        image_size: Side length of each tile.
        use_thumbnail: When true and more than one tile was produced, the
            whole image resized to ``image_size`` x ``image_size`` is
            appended as the last entry.

    Returns:
        The list of tiles (plus the optional thumbnail) and the chosen
        aspect ratio.
    """
    source_width, source_height = image.size

    # The thumbnail is appended below, so request the bare tile count here.
    tile_count, target_width, target_height, target_aspect_ratio = (
        calculate_h2ovl_targets(
            orig_width=source_width,
            orig_height=source_height,
            target_ratios=target_ratios,
            image_size=image_size,
            use_thumbnail=False,
        )
    )

    resized = image.resize((target_width, target_height))

    processed_images = []
    for index in range(tile_count):
        # Tiles are taken row by row across the resized image.
        row, column = divmod(index, target_width // image_size)
        left = column * image_size
        upper = row * image_size
        tile = resized.crop((left, upper, left + image_size, upper + image_size))
        processed_images.append(tile)

    assert len(processed_images) == tile_count

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images, target_aspect_ratio
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Tile one image and stack the transformed tiles into a tensor.

    Args:
        image: Source image.
        input_size: Tile side length, also passed to ``build_transform``.
        min_num: Lower bound used when resolving candidate ratios.
        max_num: Upper bound used when resolving candidate ratios.
        use_thumbnail: Forwarded to ``dynamic_preprocess_h2ovl``.
        prior_aspect_ratio: Forwarded to ``get_h2ovl_target_ratios``.

    Returns:
        The stacked tile tensor and the aspect ratio chosen for the image.
    """
    candidate_ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    to_tensor = build_transform(input_size=input_size)

    tiles, chosen_ratio = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=candidate_ratios,
    )

    tile_tensors = [to_tensor(tile) for tile in tiles]
    return torch.stack(tile_tensors), chosen_ratio
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert one image into its stacked tile tensor.

    Args:
        image: Source image.
        input_size: Tile side length.
        min_num: Lower patch bound; only used when ``use_msac`` is false.
        max_num: Upper patch bound, used by every pass.
        use_thumbnail: Thumbnail switch; only used when ``use_msac`` is
            false (both MSAC passes always request a thumbnail).
        use_msac: When true, the image is processed twice and the results
            are combined.

    Returns:
        The tile tensor for the image.
    """
    if not use_msac:
        pixel_values, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return pixel_values

    # First pass: minimum of 1, no prior ratio.
    first_pass, first_ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )

    # Second pass: minimum of 3, filtered by the ratio picked in pass one.
    second_pass, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=first_ratio,
    )

    # Second-pass entries (minus the last), then first-pass entries (minus
    # the last), then the last second-pass entry.
    return torch.cat(
        [second_pass[:-1], first_pass[:-1], second_pass[-1:]],
        dim=0,
    )
class H2OVLImageProcessor(InternVLImageProcessor):
    """InternVL image processor variant using the H2OVL tiling helpers.

    Adds the ``use_msac`` switch on top of the options handled by
    ``InternVLImageProcessor``.
    """

    def __init__(
        self,
        image_size: int,
        min_dynamic_patch: int,
        max_dynamic_patch: int,
        dynamic_image_size: bool,
        use_thumbnail: bool,
        use_msac: bool,
    ) -> None:
        """Forward the shared options to the base class and keep ``use_msac``."""
        super().__init__(
            image_size=image_size,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        self.use_msac = use_msac

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve the ``(min, max)`` patch bounds.

        Every argument left as ``None`` falls back to the attribute of the
        same name on this processor.
        """
        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=(
                self.min_dynamic_patch
                if min_dynamic_patch is None
                else min_dynamic_patch
            ),
            max_dynamic_patch=(
                self.max_dynamic_patch
                if max_dynamic_patch is None
                else max_dynamic_patch
            ),
            dynamic_image_size=(
                self.dynamic_image_size
                if dynamic_image_size is None
                else dynamic_image_size
            ),
            use_thumbnail=(
                self.use_thumbnail if use_thumbnail is None else use_thumbnail
            ),
        )

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its tile tensor.

        MSAC is only applied when exactly one image is given; otherwise it
        is disabled regardless of ``self.use_msac``.
        """
        msac_enabled = len(images) == 1 and self.use_msac

        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        pixel_values_lst = []
        for image in images:
            pixel_values_lst.append(
                image_to_pixel_values_h2ovl(
                    image,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=msac_enabled,
                )
            )
        return pixel_values_lst
class H2OVLProcessor(InternVLProcessor):
    """InternVL processor variant that counts image tokens the H2OVL way."""

    def __init__(
        self,
        image_processor: H2OVLImageProcessor,
        tokenizer: HfTokenizer,
        *,
        image_seq_length: int,
        start_image_token: str = "<img>",
        end_image_token: str = "</img>",
        ctx_image_token: str = "<IMG_CONTEXT>",
    ) -> None:
        """Forward every argument unchanged to ``InternVLProcessor``."""
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            image_seq_length=image_seq_length,
            start_image_token=start_image_token,
            end_image_token=end_image_token,
            ctx_image_token=ctx_image_token,
        )

        # Annotation only: narrows the attribute type for type checkers.
        self.image_processor: H2OVLImageProcessor

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tile-grid ratios for the given settings.

        The patch bounds come from the image processor's
        ``resolve_min_max_num``; ``override_min_num``, when given, replaces
        the resolved lower bound. ``prior_aspect_ratio`` is forwarded to
        ``get_h2ovl_target_ratios``.
        """
        resolved_min, resolved_max = self.image_processor.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        lower = resolved_min if override_min_num is None else override_min_num

        return get_h2ovl_target_ratios(
            lower,
            resolved_max,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        Args:
            image_width: Width of the image.
            image_height: Height of the image.
            use_msac: Overrides the image processor's ``use_msac`` setting
                when not ``None``.

        Returns:
            The patch count multiplied by ``self.image_seq_length``.
        """
        image_processor = self.image_processor
        msac_enabled = image_processor.use_msac if use_msac is None else use_msac
        thumbnail_enabled = image_processor.use_thumbnail

        def count_patches(ratios, thumbnail):
            # Shared call for every pass; returns (patch count, chosen ratio).
            patches, _, _, chosen_ratio = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=image_processor.image_size,
                target_ratios=ratios,
                use_thumbnail=thumbnail,
            )
            return patches, chosen_ratio

        if not msac_enabled:
            ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            num_patches, _ = count_patches(ratios, thumbnail_enabled)
            return num_patches * self.image_seq_length

        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            override_min_num=1,
        )
        first_patches, first_ratio = count_patches(first_ratios, True)

        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            prior_aspect_ratio=first_ratio,
            override_min_num=3,
        )
        second_patches, _ = count_patches(second_ratios, True)

        # image_to_pixel_values_h2ovl drops the last entry of the first pass
        # when combining both passes, hence the -1.
        num_patches = first_patches + second_patches - 1
        return num_patches * self.image_seq_length
Prev
1
…
27
28
29
30
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment