Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1491 additions
and
246 deletions
+1491
-246
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/test_whisper.py
+1
-1
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+177
-0
tests/models/multimodal/pooling/conftest.py
tests/models/multimodal/pooling/conftest.py
+9
-15
tests/models/multimodal/pooling/test_intern_vit.py
tests/models/multimodal/pooling/test_intern_vit.py
+3
-1
tests/models/multimodal/pooling/test_jinavl_reranker.py
tests/models/multimodal/pooling/test_jinavl_reranker.py
+313
-137
tests/models/multimodal/pooling/test_radio.py
tests/models/multimodal/pooling/test_radio.py
+13
-9
tests/models/multimodal/pooling/test_siglip.py
tests/models/multimodal/pooling/test_siglip.py
+8
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+27
-2
tests/models/multimodal/processing/test_gemma3.py
tests/models/multimodal/processing/test_gemma3.py
+141
-1
tests/models/multimodal/processing/test_qwen3_omni.py
tests/models/multimodal/processing/test_qwen3_omni.py
+104
-0
tests/models/multimodal/processing/test_tensor_schema.py
tests/models/multimodal/processing/test_tensor_schema.py
+15
-11
tests/models/quantization/untest_fp8.py
tests/models/quantization/untest_fp8.py
+3
-2
tests/models/quantization/untest_gptq_marlin_24.py
tests/models/quantization/untest_gptq_marlin_24.py
+4
-1
tests/models/registry.py
tests/models/registry.py
+130
-31
tests/models/test_initialization.py
tests/models/test_initialization.py
+12
-6
tests/models/test_registry.py
tests/models/test_registry.py
+5
-1
tests/models/test_terratorch.py
tests/models/test_terratorch.py
+1
-1
tests/models/test_vision.py
tests/models/test_vision.py
+4
-3
tests/models/utils.py
tests/models/utils.py
+18
-24
tests/multimodal/test_audio.py
tests/multimodal/test_audio.py
+503
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/generation/test_whisper.py
View file @
7e63ef82
...
@@ -114,7 +114,7 @@ def check_model_available(model: str) -> None:
...
@@ -114,7 +114,7 @@ def check_model_available(model: str) -> None:
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
cpu_model
@
pytest
.
mark
.
cpu_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
)])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
)])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"float"
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
,
False
])
@
create_new_process_for_each_test
(
"spawn"
)
@
create_new_process_for_each_test
(
"spawn"
)
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
7e63ef82
...
@@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return
hf_model
return
hf_model
def
isaac_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patch HF runner for Isaac:
1) Move processor outputs to model device
2) Ensure IsaacModel.forward returns hidden_states
for compatibility with hidden_states_to_seq_logprobs()
"""
from
perceptron.tensorstream
import
TextType
from
perceptron.tensorstream.ops
import
compute_mrope_pos_tensor
,
modality_mask
from
transformers.modeling_outputs
import
BaseModelOutputWithPast
def
compute_position_ids_input_ids
(
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""
Create 3D positional indices for token input.
"""
batch_size
,
seq_length
=
input_ids
.
shape
position_ids
=
torch
.
arange
(
seq_length
,
device
=
input_ids
.
device
)
position_ids
=
position_ids
.
view
(
1
,
-
1
).
expand
(
batch_size
,
-
1
)
position_ids
=
position_ids
.
unsqueeze
(
2
).
expand
(
-
1
,
-
1
,
3
)
# Add 3D for MRoPE
return
position_ids
model_device
=
next
(
hf_model
.
model
.
parameters
()).
device
# ----------------------------
# 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
# ----------------------------
original_processor
=
hf_model
.
processor
def
patched_processor
(
*
args
,
**
kwargs
):
result
=
original_processor
(
*
args
,
**
kwargs
)
for
k
,
v
in
result
.
data
.
items
():
result
[
k
]
=
v
.
to
(
model_device
)
return
result
hf_model
.
processor
=
patched_processor
tokenizer
=
AutoTokenizer
.
from_pretrained
(
hf_model
.
model_name
,
trust_remote_code
=
True
)
original_generate
=
hf_model
.
model
.
generate
def
patched_generate
(
*
args
,
**
kwargs
):
kwargs
[
"pad_token_id"
]
=
tokenizer
.
eos_token_id
kwargs
[
"eos_token_id"
]
=
tokenizer
.
eos_token_id
return
original_generate
(
*
args
,
**
kwargs
)
hf_model
.
model
.
generate
=
patched_generate
# ----------------------------
# 2) Patch IsaacModel.forward: add hidden_states to the output
# ----------------------------
isaac_model
=
hf_model
.
model
.
model
def
patched_forward
(
self
,
input_ids
=
None
,
tensor_stream
=
None
,
attention_mask
=
None
,
position_ids
=
None
,
modality_tensor
=
None
,
past_key_values
=
None
,
inputs_embeds
=
None
,
use_cache
=
None
,
output_hidden_states
=
None
,
return_dict
=
None
,
cache_position
=
None
,
**
kwargs
,
):
"""
Forward pass with MRoPE position embeddings.
Computes position embeddings once and passes them through all layers.
"""
output_hidden_states
=
(
output_hidden_states
if
output_hidden_states
is
not
None
else
self
.
config
.
output_hidden_states
)
use_cache
=
use_cache
if
use_cache
is
not
None
else
self
.
config
.
use_cache
return_dict
=
(
return_dict
if
return_dict
is
not
None
else
self
.
config
.
use_return_dict
)
# Get inputs
if
tensor_stream
is
not
None
and
inputs_embeds
is
not
None
:
raise
ValueError
(
"You cannot specify both tensor_stream and inputs_embeds"
)
elif
tensor_stream
is
not
None
:
# Embed TensorStream directly
inputs_embeds
=
self
.
embed_stream
(
tensor_stream
)
# Create modality tensor if not provided
if
modality_tensor
is
None
:
modality_tensor
=
modality_mask
(
tensor_stream
)
elif
input_ids
is
not
None
and
inputs_embeds
is
not
None
:
raise
ValueError
(
"You cannot specify both input_ids and inputs_embeds at the same time"
)
elif
input_ids
is
not
None
:
inputs_embeds
=
self
.
embed_tokens
(
input_ids
)
# Create text modality tensor if not provided
if
modality_tensor
is
None
:
batch_size
,
seq_length
=
input_ids
.
shape
modality_tensor
=
torch
.
full
(
(
batch_size
,
seq_length
),
TextType
.
text
.
value
,
device
=
input_ids
.
device
,
dtype
=
torch
.
long
,
)
elif
inputs_embeds
is
None
:
raise
ValueError
(
"You have to specify either tensor_stream, input_ids or inputs_embeds"
)
# Create default position_ids if not provided
if
position_ids
is
None
:
if
tensor_stream
is
not
None
:
position_ids
=
compute_mrope_pos_tensor
(
tensor_stream
)
# (B,L,3)
else
:
position_ids
=
compute_position_ids_input_ids
(
input_ids
)
# Compute MRoPE position embeddings if we have custom rotary_emb
cos
,
sin
=
self
.
rotary_emb
(
position_ids
,
modality_tensor
)
cos
=
cos
.
to
(
inputs_embeds
.
dtype
)
sin
=
sin
.
to
(
inputs_embeds
.
dtype
)
# Prepare attention mask
if
attention_mask
is
not
None
:
attention_mask
=
self
.
_update_causal_mask
(
attention_mask
,
inputs_embeds
,
cache_position
,
past_key_values
,
False
)
# Initialize and collect hidden states
hidden_states
=
inputs_embeds
hidden_states_list
:
list
[
torch
.
Tensor
]
=
[]
if
output_hidden_states
:
hidden_states_list
.
append
(
hidden_states
)
for
decoder_layer
in
self
.
layers
:
layer_outputs
=
decoder_layer
(
hidden_states
,
attention_mask
=
attention_mask
,
position_ids
=
position_ids
,
past_key_value
=
past_key_values
,
use_cache
=
use_cache
,
cache_position
=
cache_position
,
position_embeddings
=
(
cos
,
sin
),
**
kwargs
,
)
hidden_states
=
(
layer_outputs
[
0
]
if
isinstance
(
layer_outputs
,
tuple
)
else
layer_outputs
)
if
output_hidden_states
:
hidden_states_list
.
append
(
hidden_states
)
# Final layer norm
hidden_states
=
self
.
norm
(
hidden_states
)
if
output_hidden_states
:
hidden_states_list
.
append
(
hidden_states
)
# Convert to tuple or None
all_hidden_states
=
tuple
(
hidden_states_list
)
if
output_hidden_states
else
None
# Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
return
BaseModelOutputWithPast
(
last_hidden_state
=
hidden_states
,
past_key_values
=
past_key_values
,
hidden_states
=
all_hidden_states
,
)
isaac_model
.
forward
=
types
.
MethodType
(
patched_forward
,
isaac_model
)
return
hf_model
def
skyworkr1v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
def
skyworkr1v_patch_hf_runner
(
hf_model
:
HfRunner
)
->
HfRunner
:
"""Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
"""Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
...
...
tests/models/multimodal/pooling/conftest.py
View file @
7e63ef82
...
@@ -2,23 +2,17 @@
...
@@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
"""Pytest configuration for vLLM pooling tests."""
import
os
import
pytest
import
warnings
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
def
pytest_collection_modifyitems
(
config
,
items
):
@
pytest
.
fixture
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
def
siglip_attention_config
():
if
not
current_platform
.
is_rocm
():
"""Return attention config for SigLIP tests on ROCm.
return
siglip_tests
=
[
item
for
item
in
items
if
"test_siglip"
in
item
.
nodeid
]
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if
siglip_tests
:
if
current_platform
.
is_rocm
():
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
"FLEX_ATTENTION"
return
{
"backend"
:
"FLEX_ATTENTION"
}
warnings
.
warn
(
return
None
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests"
,
UserWarning
,
stacklevel
=
1
,
)
tests/models/multimodal/pooling/test_intern_vit.py
View file @
7e63ef82
...
@@ -78,7 +78,9 @@ def run_intern_vit_test(
...
@@ -78,7 +78,9 @@ def run_intern_vit_test(
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_models
(
dist_init
,
image_assets
,
model_id
,
dtype
:
str
)
->
None
:
def
test_models
(
default_vllm_config
,
dist_init
,
image_assets
,
model_id
,
dtype
:
str
)
->
None
:
run_intern_vit_test
(
run_intern_vit_test
(
image_assets
,
image_assets
,
model_id
,
model_id
,
...
...
tests/models/multimodal/pooling/test_jinavl_reranker.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
cast
import
pytest
import
pytest
from
transformers
import
AutoModel
from
transformers
import
AutoModel
from
vllm.entrypoints.chat_utils
import
ChatCompletionContentPartImageParam
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionContentPartImageEmbedsParam
,
ChatCompletionContentPartImageParam
,
ChatCompletionContentPartTextParam
,
)
from
vllm.entrypoints.score_utils
import
ScoreMultiModalParam
from
vllm.entrypoints.score_utils
import
ScoreMultiModalParam
from
....conftest
import
HfRunner
,
VllmRunner
from
....conftest
import
HfRunner
,
VllmRunner
model_name
=
"jinaai/jina-reranker-m0"
MODELS
=
[
"jinaai/jina-reranker-m0"
]
mm_processor_kwargs
=
{
MM_PROCESSOR_KWARGS
=
{
"min_pixels"
:
3136
,
"min_pixels"
:
3136
,
"max_pixels"
:
602112
,
"max_pixels"
:
602112
,
}
}
limit_mm_per_prompt
=
{
"image"
:
2
}
LIMIT_MM_PER_PROMPT
=
{
"image"
:
2
}
CHECKPOINT_TO_HF_MAPPER
=
{
"visual."
:
"model.visual."
,
"model."
:
"model.language_model."
,
}
# Shared long text for test data
LONG_TEXT_DOC
=
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements."""
# noqa: E501
# Test data for different scenarios
TEXT_IMAGE_TEST_DATA
=
{
"query"
:
[{
"text"
:
"slm markdown"
}],
"documents"
:
[
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
],
}
TEXT_TEXT_TEST_DATA
=
{
"query"
:
[{
"text"
:
"slm markdown"
}],
"documents"
:
[
{
"text"
:
LONG_TEXT_DOC
},
{
"text"
:
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?"
},
],
}
IMAGE_TEXT_TEST_DATA
=
{
"query"
:
[
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}
],
"documents"
:
[
{
"text"
:
LONG_TEXT_DOC
},
{
"text"
:
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?"
},
],
}
IMAGE_IMAGE_TEST_DATA
=
{
"query"
:
[
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
}
],
"documents"
:
[
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
],
}
def
vllm_reranker
(
TEXT_MIXED_DOCS_TEST_DATA
=
{
"query"
:
[{
"text"
:
"slm markdown"
}],
"documents"
:
[
{
"text"
:
LONG_TEXT_DOC
},
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
{
"text"
:
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?"
},
{
"image"
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
],
}
def
_normalize_image
(
image_val
:
str
)
->
str
:
"""Normalize image value to proper format for HF model."""
return
(
image_val
if
image_val
.
startswith
((
"http://"
,
"https://"
))
else
f
"data:image/png;base64,
{
image_val
}
"
)
def
create_score_multimodal_param
(
content_parts
:
list
[
dict
],
)
->
ScoreMultiModalParam
:
"""
Create a ScoreMultiModalParam from a list of content dictionaries.
Each dict supports the following formats:
- Text: {'text': 'content'}
- Image URL: {'image': 'https://...'}
- Image Base64: {'image': 'base64_str'}
"""
formatted_content
=
[]
for
part
in
content_parts
:
if
"text"
in
part
:
formatted_content
.
append
(
ChatCompletionContentPartTextParam
(
type
=
"text"
,
text
=
part
[
"text"
],
)
)
elif
"image"
in
part
:
image_val
=
part
[
"image"
]
if
image_val
.
startswith
((
"http://"
,
"https://"
)):
formatted_content
.
append
(
ChatCompletionContentPartImageParam
(
type
=
"image_url"
,
image_url
=
{
"url"
:
image_val
},
)
)
else
:
formatted_content
.
append
(
ChatCompletionContentPartImageEmbedsParam
(
type
=
"image_embeds"
,
image_embeds
=
image_val
)
)
return
ScoreMultiModalParam
(
content
=
formatted_content
)
def
_run_vllm
(
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
model
_name
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
query_strs
:
list
[
str
],
query_strs
:
list
[
dict
[
str
,
str
]],
document_strs
:
list
[
str
],
document_strs
:
list
[
dict
[
str
,
str
]],
query_type
:
str
=
"text"
,
)
->
list
[
float
]:
doc_type
:
str
=
"text"
,
"""Run vLLM reranker and return scores."""
):
query
=
create_score_multimodal_param
(
query_strs
)
def
create_image_param
(
url
:
str
)
->
ChatCompletionContentPartImageParam
:
documents
=
create_score_multimodal_param
(
document_strs
)
return
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"
{
url
}
"
}}
query
:
list
[
str
]
|
ScoreMultiModalParam
if
query_type
==
"text"
:
query
=
query_strs
elif
query_type
==
"image"
:
query
=
ScoreMultiModalParam
(
content
=
[
create_image_param
(
url
)
for
url
in
query_strs
]
)
documents
:
list
[
str
]
|
ScoreMultiModalParam
if
doc_type
==
"text"
:
documents
=
document_strs
elif
doc_type
==
"image"
:
documents
=
ScoreMultiModalParam
(
content
=
[
create_image_param
(
url
)
for
url
in
document_strs
]
)
with
vllm_runner
(
with
vllm_runner
(
model
_name
,
model
,
runner
=
"pooling"
,
runner
=
"pooling"
,
dtype
=
dtype
,
dtype
=
dtype
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
max_model_len
=
2048
,
max_model_len
=
2048
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
MM_PROCESSOR_KWARGS
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
)
as
vllm_model
:
outputs
=
vllm_model
.
llm
.
score
(
query
,
documents
)
outputs
=
vllm_model
.
llm
.
score
(
query
,
documents
)
return
[
output
.
outputs
.
score
for
output
in
outputs
]
return
[
output
.
outputs
.
score
for
output
in
outputs
]
def
hf_reranker
(
def
_run_hf
(
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
model
_name
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
query_strs
:
list
[
str
],
query_strs
:
list
[
dict
[
str
,
str
]],
document_strs
:
list
[
str
],
document_strs
:
list
[
dict
[
str
,
str
]],
query_type
:
str
=
"text"
,
)
->
list
[
float
]:
doc_type
:
str
=
"text"
,
"""Run HuggingFace reranker and return scores."""
):
query
=
query_strs
[
0
]
checkpoint_to_hf_mapper
=
{
if
"text"
in
query
:
"visual."
:
"model.visual."
,
query_type
=
"text"
"model."
:
"model.language_model."
,
query_data
=
query
[
"text"
]
}
elif
"image"
in
query
:
query_type
=
"image"
data_pairs
=
[[
query_strs
[
0
],
d
]
for
d
in
document_strs
]
query_data
=
_normalize_image
(
query
[
"image"
])
else
:
raise
ValueError
(
"Unsupported query format"
)
# Separate documents by type
text_docs
:
list
[
str
]
=
[]
image_docs
:
list
[
str
]
=
[]
text_indices
:
list
[
int
]
=
[]
image_indices
:
list
[
int
]
=
[]
for
idx
,
doc
in
enumerate
(
document_strs
):
if
"text"
in
doc
:
text_docs
.
append
(
doc
[
"text"
])
text_indices
.
append
(
idx
)
elif
"image"
in
doc
:
image_docs
.
append
(
_normalize_image
(
doc
[
"image"
]))
image_indices
.
append
(
idx
)
else
:
raise
ValueError
(
f
"Unsupported document format at index
{
idx
}
"
)
scores
:
list
[
None
|
float
]
=
[
None
]
*
len
(
document_strs
)
with
hf_runner
(
with
hf_runner
(
model
_name
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
auto_cls
=
AutoModel
,
auto_cls
=
AutoModel
,
model_kwargs
=
{
"key_mapping"
:
checkpoint_to_hf_mapper
},
model_kwargs
=
{
"key_mapping"
:
CHECKPOINT_TO_HF_MAPPER
},
)
as
hf_model
:
)
as
hf_model
:
return
hf_model
.
model
.
compute_score
(
# Score text documents
data_pairs
,
max_length
=
2048
,
query_type
=
query_type
,
doc_type
=
doc_type
if
text_docs
:
)
text_scores
=
hf_model
.
model
.
compute_score
(
[[
query_data
,
d
]
for
d
in
text_docs
],
max_length
=
2048
,
query_type
=
query_type
,
doc_type
=
"text"
,
)
for
i
,
s
in
zip
(
text_indices
,
text_scores
):
scores
[
i
]
=
s
# Score image documents
if
image_docs
:
image_scores
=
hf_model
.
model
.
compute_score
(
[[
query_data
,
d
]
for
d
in
image_docs
],
max_length
=
2048
,
query_type
=
query_type
,
doc_type
=
"image"
,
)
for
i
,
s
in
zip
(
image_indices
,
image_scores
):
scores
[
i
]
=
s
# Visual Documents Reranking
assert
all
(
s
is
not
None
for
s
in
scores
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
model_name
])
return
cast
(
list
[
float
],
scores
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_text_image
(
hf_runner
,
vllm_runner
,
model_name
,
dtype
):
query
=
[
"slm markdown"
]
documents
=
[
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
,
]
hf_outputs
=
hf_reranker
(
hf_runner
,
model_name
,
dtype
,
query
,
documents
,
"text"
,
"image"
)
vllm_outputs
=
vllm_reranker
(
vllm_runner
,
model_name
,
dtype
,
query
,
documents
,
"text"
,
"image"
)
assert
hf_outputs
[
0
]
==
pytest
.
approx
(
vllm_outputs
[
0
],
rel
=
0.02
)
assert
hf_outputs
[
1
]
==
pytest
.
approx
(
vllm_outputs
[
1
],
rel
=
0.02
)
def
_run_test
(
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
model
:
str
,
dtype
:
str
,
query_strs
:
list
[
dict
[
str
,
str
]],
document_strs
:
list
[
dict
[
str
,
str
]],
)
->
None
:
"""Run comparison test between vLLM and HuggingFace implementations."""
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# Textual Documents Reranking
vllm_outputs
=
_run_vllm
(
vllm_runner
,
model
,
dtype
,
query_strs
,
document_strs
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
model_name
])
hf_outputs
=
_run_hf
(
hf_runner
,
model
,
dtype
,
query_strs
,
document_strs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_text_text
(
hf_runner
,
vllm_runner
,
model_name
,
dtype
):
# Compare outputs
query
=
[
"slm markdown"
]
assert
len
(
hf_outputs
)
==
len
(
vllm_outputs
),
(
documents
=
[
f
"Output length mismatch: HF=
{
len
(
hf_outputs
)
}
, vLLM=
{
len
(
vllm_outputs
)
}
"
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements."""
,
# noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?"
,
]
hf_outputs
=
hf_reranker
(
hf_runner
,
model_name
,
dtype
,
query
,
documents
,
"text"
,
"text"
)
vllm_outputs
=
vllm_reranker
(
vllm_runner
,
model_name
,
dtype
,
query
,
documents
,
"text"
,
"text"
)
)
assert
hf_outputs
[
0
]
==
pytest
.
approx
(
vllm_outputs
[
0
],
rel
=
0.02
)
for
i
,
(
hf_score
,
vllm_score
)
in
enumerate
(
zip
(
hf_outputs
,
vllm_outputs
)):
assert
hf_outputs
[
1
]
==
pytest
.
approx
(
vllm_outputs
[
1
],
rel
=
0.02
)
assert
hf_score
==
pytest
.
approx
(
vllm_score
,
rel
=
0.02
),
(
f
"Score mismatch at index
{
i
}
: HF=
{
hf_score
}
, vLLM=
{
vllm_score
}
"
)
# Image Querying for Textual Documents
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
model_name
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_image_text
(
hf_runner
,
vllm_runner
,
model_name
,
dtype
):
def
test_model_text_image
(
query
=
[
hf_runner
,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
vllm_runner
,
]
model
:
str
,
documents
=
[
dtype
:
str
,
"""We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
)
->
None
:
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
"""Visual Documents Reranking"""
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
_run_test
(
large language models. The models effectiveness results from two key innovations: (1) a three-stage
hf_runner
,
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
vllm_runner
,
refining, and critiquing web content extraction; and (2) a unified training framework combining
model
,
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
dtype
,
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
TEXT_IMAGE_TEST_DATA
[
"query"
],
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
TEXT_IMAGE_TEST_DATA
[
"documents"
],
lower computational requirements."""
,
# noqa: E501
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?"
,
]
hf_outputs
=
hf_reranker
(
hf_runner
,
model_name
,
dtype
,
query
,
documents
,
"image"
,
"text"
)
vllm_outputs
=
vllm_reranker
(
vllm_runner
,
model_name
,
dtype
,
query
,
documents
,
"image"
,
"text"
)
)
assert
hf_outputs
[
0
]
==
pytest
.
approx
(
vllm_outputs
[
0
],
rel
=
0.02
)
assert
hf_outputs
[
1
]
==
pytest
.
approx
(
vllm_outputs
[
1
],
rel
=
0.02
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_text_text
(
hf_runner
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
"""Textual Documents Reranking"""
_run_test
(
hf_runner
,
vllm_runner
,
model
,
dtype
,
TEXT_TEXT_TEST_DATA
[
"query"
],
TEXT_TEXT_TEST_DATA
[
"documents"
],
)
# Image Querying for Image Documents
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
model_name
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_image_image
(
hf_runner
,
vllm_runner
,
model_name
,
dtype
):
def
test_model_image_text
(
query
=
[
hf_runner
,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
vllm_runner
,
]
model
:
str
,
documents
=
[
dtype
:
str
,
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
,
)
->
None
:
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
,
"""Image Querying for Textual Documents"""
]
_run_test
(
hf_runner
,
hf_outputs
=
hf_reranker
(
vllm_runner
,
hf_runner
,
model_name
,
dtype
,
query
,
documents
,
"image"
,
"image"
model
,
dtype
,
IMAGE_TEXT_TEST_DATA
[
"query"
],
IMAGE_TEXT_TEST_DATA
[
"documents"
],
)
)
vllm_outputs
=
vllm_reranker
(
vllm_runner
,
model_name
,
dtype
,
query
,
documents
,
"image"
,
"image"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_image_image
(
hf_runner
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
"""Image Querying for Image Documents"""
_run_test
(
hf_runner
,
vllm_runner
,
model
,
dtype
,
IMAGE_IMAGE_TEST_DATA
[
"query"
],
IMAGE_IMAGE_TEST_DATA
[
"documents"
],
)
)
assert
hf_outputs
[
0
]
==
pytest
.
approx
(
vllm_outputs
[
0
],
rel
=
0.02
)
assert
hf_outputs
[
1
]
==
pytest
.
approx
(
vllm_outputs
[
1
],
rel
=
0.02
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
def
test_model_text_mixed_documents
(
hf_runner
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
"""Text Query for Mixed Text and Image Documents"""
_run_test
(
hf_runner
,
vllm_runner
,
model
,
dtype
,
TEXT_MIXED_DOCS_TEST_DATA
[
"query"
],
TEXT_MIXED_DOCS_TEST_DATA
[
"documents"
],
)
tests/models/multimodal/pooling/test_radio.py
View file @
7e63ef82
...
@@ -40,15 +40,15 @@ def run_radio_test(
...
@@ -40,15 +40,15 @@ def run_radio_test(
for
image
in
images
for
image
in
images
]
]
config
=
AutoConfig
.
from_pretrained
(
model_id
,
trust_remote_code
=
True
)
hf_
config
=
AutoConfig
.
from_pretrained
(
model_id
,
trust_remote_code
=
True
)
# RADIO model on HF does not properly handle torch_dtype argument
# RADIO model on HF does not properly handle torch_dtype argument
# And relies on args["dtype"] which we have to patch manually:
# And relies on args["dtype"] which we have to patch manually:
config
.
args
[
"dtype"
]
=
torch_dtype
hf_
config
.
args
[
"dtype"
]
=
torch_dtype
hf_model
=
AutoModel
.
from_pretrained
(
hf_model
=
AutoModel
.
from_pretrained
(
model_id
,
model_id
,
config
=
config
,
config
=
hf_
config
,
dtype
=
torch_dtype
,
dtype
=
torch_dtype
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
).
to
(
"cuda"
)
).
to
(
"cuda"
)
...
@@ -62,13 +62,14 @@ def run_radio_test(
...
@@ -62,13 +62,14 @@ def run_radio_test(
hf_model
.
make_preprocessor_external
()
hf_model
.
make_preprocessor_external
()
hf_outputs_per_image
=
[
hf_outputs_per_image
=
[
hf_model
(
pixel_value
.
to
(
"cuda"
))
.
features
for
pixel_value
in
pixel_values
hf_model
(
pixel_value
.
to
(
"cuda"
))
for
pixel_value
in
pixel_values
]
]
radio_config
=
RadioConfig
(
vllm_config
=
RadioConfig
(
model_name
=
config
.
args
[
"model"
],
reg_tokens
=
config
.
args
[
"register_multiple"
]
model_name
=
hf_config
.
args
[
"model"
],
**
hf_config
.
args
,
)
)
vllm_model
=
RadioModel
(
radio
_config
)
vllm_model
=
RadioModel
(
vllm
_config
)
vllm_model
.
load_weights
(
hf_model
.
state_dict
())
vllm_model
.
load_weights
(
hf_model
.
state_dict
())
vllm_model
=
vllm_model
.
to
(
"cuda"
,
torch_dtype
)
vllm_model
=
vllm_model
.
to
(
"cuda"
,
torch_dtype
)
...
@@ -80,7 +81,8 @@ def run_radio_test(
...
@@ -80,7 +81,8 @@ def run_radio_test(
cos_similar
=
nn
.
CosineSimilarity
(
dim
=-
1
)
cos_similar
=
nn
.
CosineSimilarity
(
dim
=-
1
)
for
vllm_output
,
hf_output
in
zip
(
vllm_outputs_per_image
,
hf_outputs_per_image
):
for
vllm_output
,
hf_output
in
zip
(
vllm_outputs_per_image
,
hf_outputs_per_image
):
assert
cos_similar
(
vllm_output
,
hf_output
).
mean
()
>
0.99
assert
cos_similar
(
vllm_output
[
0
],
hf_output
[
0
]).
mean
()
>
0.99
assert
cos_similar
(
vllm_output
[
1
],
hf_output
[
1
]).
mean
()
>
0.99
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -90,7 +92,9 @@ def run_radio_test(
...
@@ -90,7 +92,9 @@ def run_radio_test(
],
],
)
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
def
test_radio
(
dist_init
,
image_assets
,
model_id
,
dtype
:
str
)
->
None
:
def
test_radio
(
default_vllm_config
,
dist_init
,
image_assets
,
model_id
,
dtype
:
str
)
->
None
:
run_radio_test
(
run_radio_test
(
image_assets
,
image_assets
,
model_id
,
model_id
,
...
...
tests/models/multimodal/pooling/test_siglip.py
View file @
7e63ef82
...
@@ -38,6 +38,7 @@ def _run_test(
...
@@ -38,6 +38,7 @@ def _run_test(
*
,
*
,
dtype
:
str
,
dtype
:
str
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
attention_config
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
None
:
)
->
None
:
if
tokenization_kwargs
is
None
:
if
tokenization_kwargs
is
None
:
tokenization_kwargs
=
{}
tokenization_kwargs
=
{}
...
@@ -49,6 +50,7 @@ def _run_test(
...
@@ -49,6 +50,7 @@ def _run_test(
enforce_eager
=
True
,
enforce_eager
=
True
,
max_model_len
=
64
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
attention_config
=
attention_config
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
input_texts
,
images
=
input_images
,
tokenization_kwargs
=
tokenization_kwargs
...
@@ -90,6 +92,7 @@ def test_models_text(
...
@@ -90,6 +92,7 @@ def test_models_text(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
siglip_attention_config
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
...
@@ -108,6 +111,7 @@ def test_models_text(
...
@@ -108,6 +111,7 @@ def test_models_text(
"padding"
:
"max_length"
,
"padding"
:
"max_length"
,
"max_length"
:
64
,
"max_length"
:
64
,
},
# siglip2 was trained with this padding setting.
},
# siglip2 was trained with this padding setting.
attention_config
=
siglip_attention_config
,
)
)
...
@@ -117,6 +121,7 @@ def test_models_image(
...
@@ -117,6 +121,7 @@ def test_models_image(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
siglip_attention_config
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
...
@@ -133,6 +138,7 @@ def test_models_image(
...
@@ -133,6 +138,7 @@ def test_models_image(
input_images
,
input_images
,
model
,
model
,
dtype
=
dtype
,
dtype
=
dtype
,
attention_config
=
siglip_attention_config
,
)
)
...
@@ -141,6 +147,7 @@ def test_models_image(
...
@@ -141,6 +147,7 @@ def test_models_image(
def
test_models_text_image_no_crash
(
def
test_models_text_image_no_crash
(
vllm_runner
,
vllm_runner
,
image_assets
,
image_assets
,
siglip_attention_config
,
model
:
str
,
model
:
str
,
dtype
:
str
,
dtype
:
str
,
)
->
None
:
)
->
None
:
...
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
...
@@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager
=
True
,
enforce_eager
=
True
,
max_model_len
=
64
,
max_model_len
=
64
,
gpu_memory_utilization
=
0.7
,
gpu_memory_utilization
=
0.7
,
attention_config
=
siglip_attention_config
,
)
as
vllm_model
:
)
as
vllm_model
:
with
pytest
.
raises
(
ValueError
,
match
=
"not both"
):
with
pytest
.
raises
(
ValueError
,
match
=
"not both"
):
vllm_model
.
embed
(
texts
,
images
=
images
)
vllm_model
.
embed
(
texts
,
images
=
images
)
...
...
tests/models/multimodal/processing/test_common.py
View file @
7e63ef82
...
@@ -86,11 +86,25 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
...
@@ -86,11 +86,25 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
return
mm_data
return
mm_data
def
glmasr_patch_mm_data
(
mm_data
:
MultiModalDataDict
)
->
MultiModalDataDict
:
"""
Patch the multimodal data for GLM-ASR model.
GLM-ASR requires text and audio to match 1:1, so we limit audio to 1.
"""
if
"audio"
in
mm_data
:
audio
=
mm_data
[
"audio"
]
if
isinstance
(
audio
,
list
)
and
len
(
audio
)
>
1
:
# Limit to single audio to match text requirement
mm_data
[
"audio"
]
=
[
audio
[
0
]]
return
mm_data
# For some multimodal models, tokenizer will always add bos_token
# For some multimodal models, tokenizer will always add bos_token
# at the beginning of prompt by default, causing hf_processor outputs
# at the beginning of prompt by default, causing hf_processor outputs
# incorrect token ids. So we need use `add_special_tokens=False` here
# incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor.
# to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES
=
{
_ADD_SPECIAL_TOKENS_OVERRIDES
=
{
"nemotron_parse"
:
False
,
"ovis"
:
False
,
"ovis"
:
False
,
"ovis2_5"
:
False
,
"ovis2_5"
:
False
,
"paligemma"
:
False
,
"paligemma"
:
False
,
...
@@ -106,9 +120,11 @@ _IGNORE_MM_KEYS = {
...
@@ -106,9 +120,11 @@ _IGNORE_MM_KEYS = {
}
}
MM_DATA_PATCHES
=
{
MM_DATA_PATCHES
=
{
# GLM4.1V and Qwen3-VL requires video metadata to be included in the input
# Ernie4.5-VL, GLM4.1V and Qwen3-VL requires video metadata
"ernie4_5_moe_vl"
:
qwen3_vl_patch_mm_data
,
"glm4v"
:
glm4_1v_patch_mm_data
,
"glm4v"
:
glm4_1v_patch_mm_data
,
"glm4v_moe"
:
glm4_1v_patch_mm_data
,
"glm4v_moe"
:
glm4_1v_patch_mm_data
,
"glmasr"
:
glmasr_patch_mm_data
,
"qwen3_vl"
:
qwen3_vl_patch_mm_data
,
"qwen3_vl"
:
qwen3_vl_patch_mm_data
,
"qwen3_vl_moe"
:
qwen3_vl_patch_mm_data
,
"qwen3_vl_moe"
:
qwen3_vl_patch_mm_data
,
}
}
...
@@ -212,7 +228,11 @@ def _test_processing_correctness(
...
@@ -212,7 +228,11 @@ def _test_processing_correctness(
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id_or_arch
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id_or_arch
)
model_id
=
model_id_or_arch
model_id
=
model_id_or_arch
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
,
check_max_version
=
False
,
check_version_reason
=
"vllm"
,
)
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_id
,
model_id
,
...
@@ -386,6 +406,11 @@ def test_processing_correctness(
...
@@ -386,6 +406,11 @@ def test_processing_correctness(
pytest
.
skip
(
"Fix later"
)
pytest
.
skip
(
"Fix later"
)
if
model_id
==
"jinaai/jina-reranker-m0"
:
if
model_id
==
"jinaai/jina-reranker-m0"
:
pytest
.
skip
(
"Fix later"
)
pytest
.
skip
(
"Fix later"
)
if
model_id
in
{
"Qwen/Qwen-VL"
,
"Qwen/Qwen-VL-Chat"
}:
pytest
.
skip
(
"Qwen-VL tokenizer requires downloading a font file from "
"servers that often refuse connections in CI"
)
_test_processing_correctness
(
_test_processing_correctness
(
model_id
,
model_id
,
...
...
tests/models/multimodal/processing/test_gemma3.py
View file @
7e63ef82
...
@@ -2,14 +2,154 @@
...
@@ -2,14 +2,154 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
from
vllm.model_executor.models.gemma3n_audio_utils
import
(
adjust_audio_features_to_expected_length
,
)
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
....conftest
import
ImageTestAssets
from
....conftest
import
ImageTestAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
# Gemma3 (image) model
GEMMA3_MODEL_ID
=
"google/gemma-3-4b-it"
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"google/gemma-3-4b-it"
])
# Gemma3n (multimodal with audio) model
GEMMA3N_MODEL_ID
=
"google/gemma-3n-E2B-it"
# Expected audio tokens for Gemma3n (audio_soft_tokens_per_image)
GEMMA3N_EXPECTED_AUDIO_TOKENS
=
188
class
TestGemma3nAudioTensorLogic
:
"""CPU-based tests for Gemma3n audio feature tensor manipulation.
These tests validate the padding/truncation logic in
adjust_audio_features_to_expected_length() which fixes the
integer overflow in _process_audio_input when audio_seq_len > 188.
"""
def
test_padding_when_audio_short
(
self
):
"""Test that short audio is padded to expected length."""
batch_size
,
seq_len
,
embed_dim
=
1
,
100
,
256
expected_tokens
=
GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features
=
torch
.
randn
(
batch_size
,
seq_len
,
embed_dim
)
padding_embs
=
torch
.
zeros
(
1
,
1
,
embed_dim
)
result
,
tokens_truncated
=
adjust_audio_features_to_expected_length
(
audio_features
,
expected_tokens
,
padding_embs
)
assert
result
.
shape
==
(
batch_size
,
expected_tokens
,
embed_dim
)
assert
tokens_truncated
==
0
# First 100 tokens should be original, rest should be padding (zeros)
assert
torch
.
allclose
(
result
[:,
:
seq_len
,
:],
audio_features
)
assert
torch
.
allclose
(
result
[:,
seq_len
:,
:],
torch
.
zeros
(
batch_size
,
expected_tokens
-
seq_len
,
embed_dim
),
)
def
test_truncation_when_audio_long
(
self
):
"""Test that long audio is truncated to expected length.
This is the key test for the overflow fix. Previously, when
audio_seq_len > expected_tokens, the code would compute a negative
padding value causing: RuntimeError: numel: integer multiplication overflow
"""
batch_size
,
seq_len
,
embed_dim
=
1
,
192
,
256
# 192 > 188
expected_tokens
=
GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features
=
torch
.
randn
(
batch_size
,
seq_len
,
embed_dim
)
padding_embs
=
torch
.
zeros
(
1
,
1
,
embed_dim
)
result
,
tokens_truncated
=
adjust_audio_features_to_expected_length
(
audio_features
,
expected_tokens
,
padding_embs
)
assert
result
.
shape
==
(
batch_size
,
expected_tokens
,
embed_dim
)
assert
tokens_truncated
==
seq_len
-
expected_tokens
# 192 - 188 = 4
# Result should be first 188 tokens of original
assert
torch
.
allclose
(
result
,
audio_features
[:,
:
expected_tokens
,
:])
def
test_no_change_when_exact_length
(
self
):
"""Test that exact-length audio passes through unchanged."""
batch_size
,
embed_dim
=
1
,
256
expected_tokens
=
GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features
=
torch
.
randn
(
batch_size
,
expected_tokens
,
embed_dim
)
padding_embs
=
torch
.
zeros
(
1
,
1
,
embed_dim
)
result
,
tokens_truncated
=
adjust_audio_features_to_expected_length
(
audio_features
,
expected_tokens
,
padding_embs
)
assert
result
.
shape
==
audio_features
.
shape
assert
tokens_truncated
==
0
assert
torch
.
allclose
(
result
,
audio_features
)
def
test_original_bug_would_fail
(
self
):
"""Verify the original buggy implementation would cause overflow.
The original code always tried to pad, which fails when
audio_seq_len > expected_tokens because expand() gets negative size.
"""
batch_size
,
seq_len
,
embed_dim
=
1
,
192
,
256
expected_tokens
=
GEMMA3N_EXPECTED_AUDIO_TOKENS
padding_embs
=
torch
.
zeros
(
1
,
1
,
embed_dim
)
# Original buggy logic (always pads, never truncates)
extra_padding_tokens
=
expected_tokens
-
seq_len
# = -4 (negative!)
with
pytest
.
raises
(
RuntimeError
):
# This should fail with negative size error
padding_embs
.
expand
(
batch_size
,
extra_padding_tokens
,
embed_dim
)
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
50
,
100
,
150
,
187
,
188
,
189
,
192
,
200
,
300
],
)
def
test_various_audio_lengths
(
self
,
seq_len
:
int
):
"""Test padding/truncation with various audio lengths."""
batch_size
,
embed_dim
=
1
,
256
expected_tokens
=
GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features
=
torch
.
randn
(
batch_size
,
seq_len
,
embed_dim
)
padding_embs
=
torch
.
zeros
(
1
,
1
,
embed_dim
)
# Should not raise any errors
result
,
tokens_truncated
=
adjust_audio_features_to_expected_length
(
audio_features
,
expected_tokens
,
padding_embs
)
# Output should always be expected_tokens length
assert
result
.
shape
==
(
batch_size
,
expected_tokens
,
embed_dim
)
# Verify truncation count is correct
if
seq_len
>
expected_tokens
:
assert
tokens_truncated
==
seq_len
-
expected_tokens
else
:
assert
tokens_truncated
==
0
def
test_batch_processing
(
self
):
"""Test that batch processing works correctly."""
batch_size
,
seq_len
,
embed_dim
=
4
,
192
,
256
expected_tokens
=
GEMMA3N_EXPECTED_AUDIO_TOKENS
audio_features
=
torch
.
randn
(
batch_size
,
seq_len
,
embed_dim
)
padding_embs
=
torch
.
zeros
(
1
,
1
,
embed_dim
)
result
,
tokens_truncated
=
adjust_audio_features_to_expected_length
(
audio_features
,
expected_tokens
,
padding_embs
)
assert
result
.
shape
==
(
batch_size
,
expected_tokens
,
embed_dim
)
assert
tokens_truncated
==
seq_len
-
expected_tokens
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
GEMMA3_MODEL_ID
])
def
test_get_image_size_with_most_features
(
def
test_get_image_size_with_most_features
(
image_assets
:
ImageTestAssets
,
model_id
:
str
image_assets
:
ImageTestAssets
,
model_id
:
str
):
):
...
...
tests/models/multimodal/processing/test_qwen3_omni.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Qwen3 Omni audio processing and sample rate handling."""
from
typing
import
Any
import
numpy
as
np
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
...utils
import
build_model_context
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"Qwen/Qwen3-Omni-30B-A3B-Instruct"
])
@
pytest
.
mark
.
parametrize
(
(
"audio_sample_rate"
,
"audio_duration_sec"
),
[
(
16000
,
1.0
),
# Native Whisper sample rate, 1 second
(
16000
,
2.0
),
# Native Whisper sample rate, 2 seconds
],
)
def
test_processor_with_audio_sample_rate
(
model_id
:
str
,
audio_sample_rate
:
int
,
audio_duration_sec
:
float
,
)
->
None
:
"""
Test that vLLM's processor generates expected outputs with audio_sample_rate.
This validates that the processor correctly handles audio_sample_rate
passed via hf_processor_mm_kwargs and generates audio tokens.
"""
ctx
=
build_model_context
(
model_id
,
limit_mm_per_prompt
=
{
"audio"
:
1
,
"image"
:
0
,
"video"
:
0
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
tokenizer
=
processor
.
info
.
get_tokenizer
()
# Create audio data at the specified sample rate
audio_length
=
int
(
audio_sample_rate
*
audio_duration_sec
)
rng
=
np
.
random
.
RandomState
(
42
)
audio_data
=
rng
.
rand
(
audio_length
).
astype
(
np
.
float32
)
# Build prompt with audio placeholder
prompt
=
"<|audio_start|><|audio_pad|><|audio_end|>"
mm_data
=
{
"audio"
:
[(
audio_data
,
audio_sample_rate
)]}
# Apply processor with audio_sample_rate in mm_kwargs
hf_processor_mm_kwargs
:
dict
[
str
,
Any
]
=
{
"audio_sample_rate"
:
audio_sample_rate
,
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
# Verify audio tokens are generated
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
audio_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
audio_token
)
aud_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
audio_token_id
)
assert
aud_tok_count
>=
1
,
(
f
"Expected at least 1 audio token but got
{
aud_tok_count
}
. "
f
"sample_rate:
{
audio_sample_rate
}
Hz, duration:
{
audio_duration_sec
}
s"
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"Qwen/Qwen3-Omni-30B-A3B-Instruct"
])
def
test_longer_audio_generates_more_tokens
(
model_id
:
str
)
->
None
:
"""
Test that longer audio generates more tokens than shorter audio.
This validates that audio_sample_rate is being used correctly by checking
that audio duration affects token count as expected.
"""
ctx
=
build_model_context
(
model_id
,
limit_mm_per_prompt
=
{
"audio"
:
1
,
"image"
:
0
,
"video"
:
0
},
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
)
tokenizer
=
processor
.
info
.
get_tokenizer
()
audio_sample_rate
=
16000
rng
=
np
.
random
.
RandomState
(
42
)
def
get_token_count
(
duration
:
float
)
->
int
:
audio_length
=
int
(
audio_sample_rate
*
duration
)
audio_data
=
rng
.
rand
(
audio_length
).
astype
(
np
.
float32
)
prompt
=
"<|audio_start|><|audio_pad|><|audio_end|>"
mm_data
=
{
"audio"
:
[(
audio_data
,
audio_sample_rate
)]}
hf_processor_mm_kwargs
:
dict
[
str
,
Any
]
=
{
"audio_sample_rate"
:
audio_sample_rate
,
}
processed
=
processor
.
apply
(
prompt
,
mm_data
,
hf_processor_mm_kwargs
)
hf_proc
=
processor
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
audio_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_proc
.
audio_token
)
return
processed
[
"prompt_token_ids"
].
count
(
audio_token_id
)
short_tokens
=
get_token_count
(
1.0
)
long_tokens
=
get_token_count
(
2.0
)
assert
long_tokens
>
short_tokens
,
(
f
"Expected longer audio (2s) to have more tokens than shorter (1s). "
f
"Got short=
{
short_tokens
}
, long=
{
long_tokens
}
"
)
tests/models/multimodal/processing/test_tensor_schema.py
View file @
7e63ef82
...
@@ -138,25 +138,25 @@ def create_batched_mm_kwargs(
...
@@ -138,25 +138,25 @@ def create_batched_mm_kwargs(
)
)
# TODO(Isotr0py): Don't initalize model during test
# TODO(Isotr0py): Don't init
i
alize model during test
@
contextmanager
@
contextmanager
def
initialize_dummy_model
(
def
initialize_dummy_model
(
model_cls
:
type
[
nn
.
Module
],
model_cls
:
type
[
nn
.
Module
],
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
):
):
temp_file
=
tempfile
.
mkstemp
()[
1
]
temp_file
=
tempfile
.
mkstemp
()[
1
]
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
tensor_model_parallel_size
=
1
)
current_device
=
torch
.
get_default_device
()
current_device
=
torch
.
get_default_device
()
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
)
with
set_current_vllm_config
(
vllm_config
=
vllm_config
):
with
set_current_vllm_config
(
vllm_config
=
vllm_config
):
init_distributed_environment
(
world_size
=
1
,
rank
=
0
,
distributed_init_method
=
f
"file://
{
temp_file
}
"
,
local_rank
=
0
,
backend
=
"nccl"
,
)
initialize_model_parallel
(
tensor_model_parallel_size
=
1
)
with
set_default_torch_dtype
(
model_config
.
dtype
):
with
set_default_torch_dtype
(
model_config
.
dtype
):
torch
.
set_default_device
(
current_platform
.
device_type
)
torch
.
set_default_device
(
current_platform
.
device_type
)
model
=
model_cls
(
vllm_config
=
vllm_config
)
model
=
model_cls
(
vllm_config
=
vllm_config
)
...
@@ -172,7 +172,11 @@ def initialize_dummy_model(
...
@@ -172,7 +172,11 @@ def initialize_dummy_model(
def
test_model_tensor_schema
(
model_id
:
str
):
def
test_model_tensor_schema
(
model_id
:
str
):
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
,
check_max_version
=
False
,
check_version_reason
=
"vllm"
,
)
model_arch
=
next
(
model_arch
=
next
(
arch
for
arch
,
info
in
HF_EXAMPLE_MODELS
.
hf_models
.
items
()
if
info
==
model_info
arch
for
arch
,
info
in
HF_EXAMPLE_MODELS
.
hf_models
.
items
()
if
info
==
model_info
...
...
tests/models/quantization/untest_fp8.py
View file @
7e63ef82
...
@@ -9,7 +9,7 @@ import os
...
@@ -9,7 +9,7 @@ import os
import
pytest
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.attention.
util
s.fa_utils
import
flash_attn_supports_fp8
from
vllm.
v1.
attention.
backend
s.fa_utils
import
flash_attn_supports_fp8
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..utils
import
check_logprobs_close
from
..utils
import
check_logprobs_close
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
...
@@ -76,7 +76,6 @@ def test_models(
...
@@ -76,7 +76,6 @@ def test_models(
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
"true"
)
m
.
setenv
(
"TOKENIZERS_PARALLELISM"
,
"true"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
backend
)
MAX_MODEL_LEN
=
1024
MAX_MODEL_LEN
=
1024
NUM_LOG_PROBS
=
8
NUM_LOG_PROBS
=
8
...
@@ -87,6 +86,7 @@ def test_models(
...
@@ -87,6 +86,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
"auto"
,
kv_cache_dtype
=
"auto"
,
attention_config
=
{
"backend"
:
backend
},
)
as
vllm_model
:
)
as
vllm_model
:
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
baseline_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
...
@@ -98,6 +98,7 @@ def test_models(
...
@@ -98,6 +98,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
attention_config
=
{
"backend"
:
backend
},
)
as
vllm_model
:
)
as
vllm_model
:
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
test_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
...
...
tests/models/quantization/untest_gptq_marlin_24.py
View file @
7e63ef82
...
@@ -65,7 +65,10 @@ def test_models(
...
@@ -65,7 +65,10 @@ def test_models(
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
None
:
)
->
None
:
with
vllm_runner
(
with
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"gptq_marlin_24"
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"gptq_marlin_24"
,
allow_deprecated_quantization
=
True
,
)
as
marlin_24_model
:
)
as
marlin_24_model
:
marlin_24_outputs
=
marlin_24_model
.
generate_greedy_logprobs
(
marlin_24_outputs
=
marlin_24_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
example_prompts
,
max_tokens
,
num_logprobs
...
...
tests/models/registry.py
View file @
7e63ef82
...
@@ -51,9 +51,11 @@ class _HfExamplesInfo:
...
@@ -51,9 +51,11 @@ class _HfExamplesInfo:
The maximum version of HF Transformers that this model runs on.
The maximum version of HF Transformers that this model runs on.
"""
"""
transformers_version_reason
:
str
|
None
=
None
transformers_version_reason
:
dict
[
Literal
[
"vllm"
,
"hf"
],
str
]
|
None
=
None
"""
"""
The reason for the minimum/maximum version requirement.
The type and reason to skip test for the minimum/maximum version requirement.
vllm: skip all vLLM tests if the version requirement is not met.
hf: only skip tests that uses HF runner if the version requirement is not met.
"""
"""
require_embed_inputs
:
bool
=
False
require_embed_inputs
:
bool
=
False
...
@@ -113,6 +115,7 @@ class _HfExamplesInfo:
...
@@ -113,6 +115,7 @@ class _HfExamplesInfo:
self
,
self
,
*
,
*
,
on_fail
:
Literal
[
"error"
,
"skip"
,
"return"
],
on_fail
:
Literal
[
"error"
,
"skip"
,
"return"
],
check_version_reason
:
Literal
[
"vllm"
,
"hf"
]
=
"hf"
,
check_min_version
:
bool
=
True
,
check_min_version
:
bool
=
True
,
check_max_version
:
bool
=
True
,
check_max_version
:
bool
=
True
,
)
->
str
|
None
:
)
->
str
|
None
:
...
@@ -133,23 +136,28 @@ class _HfExamplesInfo:
...
@@ -133,23 +136,28 @@ class _HfExamplesInfo:
msg
=
f
"`transformers==
{
current_version
}
` installed, but `transformers"
msg
=
f
"`transformers==
{
current_version
}
` installed, but `transformers"
# Only check the base version for the min/max version, otherwise preview
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if
(
if
min_version
and
Version
(
cur_base_version
)
<
Version
(
min_version
):
check_min_version
is_version_valid
=
not
check_min_version
and
min_version
and
Version
(
cur_base_version
)
<
Version
(
min_version
)
):
msg
+=
f
">=
{
min_version
}
` is required to run this model."
msg
+=
f
">=
{
min_version
}
` is required to run this model."
elif
(
elif
max_version
and
Version
(
cur_base_version
)
>
Version
(
max_version
):
check_max_version
is_version_valid
=
not
check_max_version
and
max_version
and
Version
(
cur_base_version
)
>
Version
(
max_version
)
):
msg
+=
f
"<=
{
max_version
}
` is required to run this model."
msg
+=
f
"<=
{
max_version
}
` is required to run this model."
else
:
else
:
return
Non
e
is_version_valid
=
Tru
e
if
self
.
transformers_version_reason
:
# check if Transformers version breaks the corresponding model runner,
msg
+=
f
" Reason:
{
self
.
transformers_version_reason
}
"
# skip test when model runner not compatible
is_reason_valid
=
not
(
check_version_reason
and
self
.
transformers_version_reason
and
check_version_reason
in
self
.
transformers_version_reason
)
is_transformers_valid
=
is_version_valid
and
is_reason_valid
if
is_transformers_valid
:
return
None
elif
self
.
transformers_version_reason
:
for
reason_type
,
reason
in
self
.
transformers_version_reason
.
items
():
msg
+=
f
" Reason(
{
reason_type
}
):
{
reason
}
"
if
on_fail
==
"error"
:
if
on_fail
==
"error"
:
raise
RuntimeError
(
msg
)
raise
RuntimeError
(
msg
)
...
@@ -219,7 +227,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -219,7 +227,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"CwmForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/cwm"
),
min_transformers_version
=
"4.58"
),
"CwmForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/cwm"
),
min_transformers_version
=
"4.58"
),
"DbrxForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
)),
# FIXME: databricks/dbrx-instruct has been deleted
"DbrxForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
),
is_available_online
=
False
),
"DeciLMForCausalLM"
:
_HfExamplesInfo
(
"DeciLMForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Llama-3_3-Nemotron-Super-49B-v1"
),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Llama-3_3-Nemotron-Super-49B-v1"
),
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -243,6 +254,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -243,6 +254,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
),
trust_remote_code
=
True
),
),
"Exaone4ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-4.0-32B"
)),
"Exaone4ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-4.0-32B"
)),
"ExaoneMoEForCausalLM"
:
_HfExamplesInfo
(
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
min_transformers_version
=
"5.0.0"
),
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mgleize/fairseq2-dummy-Llama-3.2-1B"
)),
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mgleize/fairseq2-dummy-Llama-3.2-1B"
)),
"FalconForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
)),
"FalconForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
)),
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/Falcon-H1-0.5B-Base"
)),
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/Falcon-H1-0.5B-Base"
)),
...
@@ -282,6 +296,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -282,6 +296,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
"Grok1ModelForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"hpcai-tech/grok-1"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"hpcai-tech/grok-1"
),
trust_remote_code
=
True
),
),
"Grok1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"xai-org/grok-2"
),
trust_remote_code
=
True
),
"HunYuanDenseV1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tencent/Hunyuan-7B-Instruct"
)),
"HunYuanDenseV1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tencent/Hunyuan-7B-Instruct"
)),
"HunYuanMoEV1ForCausalLM"
:
_HfExamplesInfo
(
"HunYuanMoEV1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tencent/Hunyuan-A13B-Instruct"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"tencent/Hunyuan-A13B-Instruct"
),
trust_remote_code
=
True
...
@@ -302,6 +317,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -302,6 +317,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Jais2ForCausalLM"
:
_HfExamplesInfo
(
"Jais2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/Jais-2-8B-Chat"
),
min_transformers_version
=
"4.58"
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/Jais-2-8B-Chat"
),
min_transformers_version
=
"4.58"
),
),
"IQuestCoderForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"IQuestLab/IQuest-Coder-V1-40B-Instruct"
),
trust_remote_code
=
True
),
"IQuestLoopCoderForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"
),
trust_remote_code
=
True
),
"JAISLMHeadModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/jais-13b-chat"
)),
"Jais2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/Jais-2-8B-Chat"
),
min_transformers_version
=
"4.58"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"JambaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/AI21-Jamba-1.5-Mini"
),
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/AI21-Jamba-1.5-Mini"
),
extras
=
{
extras
=
{
...
@@ -348,6 +373,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -348,6 +373,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"MiniCPM3ForCausalLM"
:
_HfExamplesInfo
(
"MiniCPM3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
),
trust_remote_code
=
True
),
),
"MiniCPM4ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM4.1-8B"
),
trust_remote_code
=
True
),
"MiniMaxForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-Text-01-hf"
)),
"MiniMaxForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-Text-01-hf"
)),
"MiniMaxText01ForCausalLM"
:
_HfExamplesInfo
(
"MiniMaxText01ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-Text-01"
),
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-Text-01"
),
...
@@ -370,7 +398,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -370,7 +398,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"TitanML/tiny-mixtral"
)},
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"TitanML/tiny-mixtral"
)},
),
),
"MptForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mpt"
),
is_available_online
=
False
),
"MptForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mpt"
),
is_available_online
=
False
),
"MPTForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mosaicml/mpt-7b"
)),
# FIXME: mosaicml/mpt-7b has been deleted
"MPTForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mosaicml/mpt-7b"
),
is_available_online
=
False
),
"NemotronForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Minitron-8B-Base"
)),
"NemotronForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Minitron-8B-Base"
)),
"NemotronHForCausalLM"
:
_HfExamplesInfo
(
"NemotronHForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Nemotron-H-8B-Base-8K"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Nemotron-H-8B-Base-8K"
),
trust_remote_code
=
True
...
@@ -394,6 +423,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -394,6 +423,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"PanguEmbeddedForCausalLM"
:
_HfExamplesInfo
(
"PanguEmbeddedForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"FreedomIntelligence/openPangu-Embedded-7B-V1.1"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"FreedomIntelligence/openPangu-Embedded-7B-V1.1"
),
trust_remote_code
=
True
),
),
"PanguProMoEV2ForCausalLM"
:
_HfExamplesInfo
(
""
,
trust_remote_code
=
True
,
is_available_online
=
False
,
),
"PanguUltraMoEForCausalLM"
:
_HfExamplesInfo
(
"PanguUltraMoEForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1"
),
os
.
path
.
join
(
models_path_prefix
,
"FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1"
),
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -416,7 +450,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -416,7 +450,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"QWenLMHeadModel"
:
_HfExamplesInfo
(
"QWenLMHeadModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-7B-Chat"
),
max_transformers_version
=
"4.53"
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"HF model uses remote code that is not compatible with latest Transformers"
,
# noqa: E501
transformers_version_reason
=
{
"hf"
:
"HF model uses remote code that is not compatible with latest Transformers"
# noqa: E501
},
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
...
@@ -463,6 +499,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -463,6 +499,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
),
),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Zyphra/Zamba2-7B-instruct"
)),
"Zamba2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Zyphra/Zamba2-7B-instruct"
)),
"MiMoForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
),
trust_remote_code
=
True
),
"MiMoForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
),
trust_remote_code
=
True
),
"MiMoV2FlashForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-V2-Flash"
),
trust_remote_code
=
True
),
"Dots1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"rednote-hilab/dots.llm1.inst"
)),
"Dots1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"rednote-hilab/dots.llm1.inst"
)),
}
}
...
@@ -484,7 +523,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -484,7 +523,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-1_8b-reward"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-1_8b-reward"
),
trust_remote_code
=
True
),
),
"JambaForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-reward-dev"
)),
"JambaForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-reward-dev"
)),
"LlamaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llama"
,
is_available_online
=
False
)),
"LlamaModel"
:
_HfExamplesInfo
(
"llama"
,
is_available_online
=
False
),
"LlamaBidirectionalModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/llama-nemotron-embed-1b-v2"
),
trust_remote_code
=
True
),
"MistralModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)),
"MistralModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)),
"ModernBertModel"
:
_HfExamplesInfo
(
"ModernBertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-modernbert-base"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-modernbert-base"
),
trust_remote_code
=
True
...
@@ -496,12 +538,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -496,12 +538,16 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
),
max_transformers_version
=
"4.53"
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"HF model uses remote code that is not compatible with latest Transformers"
,
# noqa: E501
transformers_version_reason
=
{
"hf"
:
"HF model uses remote code that is not compatible with latest Transformers"
# noqa: E501
},
),
),
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-PRM-7B"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-PRM-7B"
),
max_transformers_version
=
"4.53"
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"HF model uses remote code that is not compatible with latest Transformers"
,
# noqa: E501
transformers_version_reason
=
{
"hf"
:
"HF model uses remote code that is not compatible with latest Transformers"
# noqa: E501
},
),
),
"RobertaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/stsb-roberta-base-v2"
)),
"RobertaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/stsb-roberta-base-v2"
)),
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-roberta-large-v1"
)),
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-roberta-large-v1"
)),
...
@@ -551,6 +597,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
...
@@ -551,6 +597,9 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]},
hf_overrides
=
{
"architectures"
:
[
"GteNewForSequenceClassification"
]},
),
),
"LlamaBidirectionalForSequenceClassification"
:
_HfExamplesInfo
(
"nvidia/llama-nemotron-rerank-1b-v2"
,
trust_remote_code
=
True
),
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-reranker-modernbert-base"
)
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-reranker-modernbert-base"
)
),
),
...
@@ -581,6 +630,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
...
@@ -581,6 +630,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
)
os
.
path
.
join
(
models_path_prefix
,
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
)
),
),
"Qwen3ForTokenClassification"
:
_HfExamplesInfo
(
"bd2lcco/Qwen3-0.6B-finetuned"
),
"Qwen3ForTokenClassification"
:
_HfExamplesInfo
(
"bd2lcco/Qwen3-0.6B-finetuned"
),
"Qwen3VLForSequenceClassification"
:
_HfExamplesInfo
(
"Qwen/Qwen3-VL-Reranker-2B"
,
is_available_online
=
False
,
hf_overrides
=
{
"architectures"
:
[
"Qwen3VLForSequenceClassification"
],
"classifier_from_token"
:
[
"no"
,
"yes"
],
"is_original_qwen3_reranker"
:
True
,
},
),
}
}
_MULTIMODAL_EXAMPLE_MODELS
=
{
_MULTIMODAL_EXAMPLE_MODELS
=
{
...
@@ -607,7 +665,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -607,7 +665,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-vl2-tiny"
),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-vl2-tiny"
),
extras
=
{
"fork"
:
os
.
path
.
join
(
models_path_prefix
,
"Isotr0py/deepseek-vl2-tiny"
)},
extras
=
{
"fork"
:
os
.
path
.
join
(
models_path_prefix
,
"Isotr0py/deepseek-vl2-tiny"
)},
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"HF model is not compatible."
,
transformers_version_reason
=
{
"hf"
:
"HF model is not compatible."
}
,
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]},
),
),
"DeepseekOCRForCausalLM"
:
_HfExamplesInfo
(
"DeepseekOCRForCausalLM"
:
_HfExamplesInfo
(
...
@@ -624,6 +682,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -624,6 +682,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"FuyuForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)),
"FuyuForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"adept/fuyu-8b"
)),
"Gemma3ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3-4b-it"
)),
"Gemma3ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3-4b-it"
)),
"Gemma3nForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3n-E2B-it"
)),
"Gemma3nForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3n-E2B-it"
)),
"GlmAsrForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"zai-org/GLM-ASR-Nano-2512"
),
trust_remote_code
=
True
,
min_transformers_version
=
"5.0"
,
),
"GraniteSpeechForConditionalGeneration"
:
_HfExamplesInfo
(
"GraniteSpeechForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-speech-3.3-2b"
)
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-speech-3.3-2b"
)
),
),
...
@@ -639,7 +702,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -639,7 +702,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
extras
=
{
"2b"
:
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-2b"
)},
extras
=
{
"2b"
:
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-2b"
)},
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"HF model is not compatible."
,
transformers_version_reason
=
{
"hf"
:
"HF model is not compatible."
}
,
),
),
"HCXVisionForCausalLM"
:
_HfExamplesInfo
(
"HCXVisionForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
),
os
.
path
.
join
(
models_path_prefix
,
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
),
...
@@ -653,6 +716,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -653,6 +716,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
),
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
),
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM-256M-Instruct"
)},
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM-256M-Instruct"
)},
),
),
"IsaacForConditionalGeneration"
:
_HfExamplesInfo
(
"PerceptronAI/Isaac-0.1"
,
trust_remote_code
=
True
,
extras
=
{
"0.2-2B-Preview"
:
"PerceptronAI/Isaac-0.2-2B-Preview"
},
),
"InternS1ForConditionalGeneration"
:
_HfExamplesInfo
(
"InternS1ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"internlm/Intern-S1"
),
trust_remote_code
=
True
os
.
path
.
join
(
models_path_prefix
,
"internlm/Intern-S1"
),
trust_remote_code
=
True
),
),
...
@@ -668,6 +736,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -668,6 +736,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
"InternVLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL3-1B-hf"
)),
"InternVLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL3-1B-hf"
)),
"KananaVForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"kakaocorp/kanana-1.5-v-3b-instruct"
),
trust_remote_code
=
True
,
),
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
),
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
),
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
@@ -681,13 +753,21 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -681,13 +753,21 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"thinking"
:
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Thinking"
)},
extras
=
{
"thinking"
:
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Thinking"
)},
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.53.3"
,
max_transformers_version
=
"4.53.3"
,
transformers_version_reason
=
"HF model uses deprecated transformers API "
transformers_version_reason
=
{
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"hf"
:
(
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
,
"HF model uses deprecated transformers API "
"(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
"https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31"
)
},
),
),
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
"LightOnOCRForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"lightonai/LightOnOCR-1B-1025"
)
os
.
path
.
join
(
models_path_prefix
,
"lightonai/LightOnOCR-1B-1025"
)
),
),
"Lfm2VlForConditionalGeneration"
:
_HfExamplesInfo
(
"LiquidAI/LFM2-VL-450M"
,
min_transformers_version
=
"5.0.0"
,
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
),
max_model_len
=
10240
,
max_model_len
=
10240
,
...
@@ -712,7 +792,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -712,7 +792,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MantisForConditionalGeneration"
:
_HfExamplesInfo
(
"MantisForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
),
os
.
path
.
join
(
models_path_prefix
,
"TIGER-Lab/Mantis-8B-siglip-llama3"
),
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"HF model is not compatible."
,
transformers_version_reason
=
{
"hf"
:
"HF model is not compatible."
}
,
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]},
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]},
),
),
"MiDashengLMModel"
:
_HfExamplesInfo
(
"MiDashengLMModel"
:
_HfExamplesInfo
(
...
@@ -739,7 +819,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -739,7 +819,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"MolmoForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
),
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Incorrectly-detected `tensorflow` import."
,
transformers_version_reason
=
{
"vllm"
:
"Incorrectly-detected `tensorflow` import from processor."
},
extras
=
{
"olmo"
:
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-O-0924"
)},
extras
=
{
"olmo"
:
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-O-0924"
)},
trust_remote_code
=
True
,
trust_remote_code
=
True
,
),
),
...
@@ -758,7 +840,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -758,7 +840,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis2-1B"
),
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis2-1B"
),
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.53"
,
max_transformers_version
=
"4.53"
,
transformers_version_reason
=
"HF model is not compatible"
,
transformers_version_reason
=
{
"hf"
:
"HF model is not compatible"
}
,
extras
=
{
extras
=
{
"1.6-llama"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
),
"1.6-llama"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
),
"1.6-gemma"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
),
"1.6-gemma"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
),
...
@@ -777,7 +859,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -777,7 +859,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
),
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.48"
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
transformers_version_reason
=
{
"hf"
:
"HF model use deprecated imports which have been removed."
},
# noqa: E501
extras
=
{
"phi3.5"
:
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)},
extras
=
{
"phi3.5"
:
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)},
),
),
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
"Phi4MMForCausalLM"
:
_HfExamplesInfo
(
...
@@ -796,7 +880,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -796,7 +880,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras
=
{
"chat"
:
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
)},
extras
=
{
"chat"
:
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
)},
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.53.3"
,
max_transformers_version
=
"4.53.3"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
transformers_version_reason
=
{
"hf"
:
"HF model uses deprecated imports which have been removed."
},
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]},
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]},
),
),
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
...
@@ -851,7 +937,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -851,7 +937,15 @@ _MULTIMODAL_EXAMPLE_MODELS = {
# disable this temporarily until we support HF format
# disable this temporarily until we support HF format
is_available_online
=
False
,
is_available_online
=
False
,
),
),
"VoxtralStreamingGeneration"
:
_HfExamplesInfo
(
"<place-holder>"
,
# disable this temporarily until we support HF format
is_available_online
=
False
,
),
# [Encoder-decoder]
# [Encoder-decoder]
"NemotronParseForConditionalGeneration"
:
_HfExamplesInfo
(
"nvidia/NVIDIA-Nemotron-Parse-v1.1"
,
trust_remote_code
=
True
),
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
),
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3-turbo"
),
extras
=
{
"v3"
:
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)},
extras
=
{
"v3"
:
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)},
...
@@ -926,6 +1020,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -926,6 +1020,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"baidu/ERNIE-4.5-21B-A3B-PT"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"baidu/ERNIE-4.5-21B-A3B-PT"
),
),
),
"ExaoneMoeMTP"
:
_HfExamplesInfo
(
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
speculative_model
=
"LGAI-EXAONE/K-EXAONE-236B-A23B"
,
min_transformers_version
=
"5.0.0"
,
),
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"zai-org/GLM-4.5"
),
os
.
path
.
join
(
models_path_prefix
,
"zai-org/GLM-4.5"
),
speculative_model
=
"zai-org/GLM-4.5"
,
speculative_model
=
"zai-org/GLM-4.5"
,
...
...
tests/models/test_initialization.py
View file @
7e63ef82
...
@@ -66,7 +66,11 @@ def can_initialize(
...
@@ -66,7 +66,11 @@ def can_initialize(
model_info
=
EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
=
EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
,
check_max_version
=
False
,
check_version_reason
=
"vllm"
,
)
hf_overrides_fn
=
partial
(
hf_overrides_fn
=
partial
(
dummy_hf_overrides
,
dummy_hf_overrides
,
...
@@ -108,11 +112,12 @@ def can_initialize(
...
@@ -108,11 +112,12 @@ def can_initialize(
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v1
),
patch
.
object
(
V1EngineCore
,
"_initialize_kv_caches"
,
_initialize_kv_caches_v1
),
monkeypatch
.
context
()
as
m
,
monkeypatch
.
context
()
as
m
,
):
):
if
model_arch
==
"GptOssForCausalLM"
:
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
# L4 supports FA3.
attention_config
=
(
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
{
"backend"
:
"TRITON_ATTN"
}
if
model_arch
==
"GptOssForCausalLM"
else
None
)
if
model_arch
==
"WhisperForConditionalGeneration"
:
if
model_arch
==
"WhisperForConditionalGeneration"
:
m
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
m
.
setenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"spawn"
)
...
@@ -143,6 +148,7 @@ def can_initialize(
...
@@ -143,6 +148,7 @@ def can_initialize(
else
"vllm"
,
else
"vllm"
,
hf_overrides
=
hf_overrides_fn
,
hf_overrides
=
hf_overrides_fn
,
max_num_seqs
=
model_info
.
max_num_seqs
,
max_num_seqs
=
model_info
.
max_num_seqs
,
attention_config
=
attention_config
,
)
)
...
...
tests/models/test_registry.py
View file @
7e63ef82
...
@@ -34,7 +34,11 @@ models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_M
...
@@ -34,7 +34,11 @@ models_path_prefix = os.getenv('VLLM_OPTEST_MODELS_PATH') or os.getenv("OPTEST_M
def
test_registry_imports
(
model_arch
):
def
test_registry_imports
(
model_arch
):
# Skip if transformers version is incompatible
# Skip if transformers version is incompatible
model_info
=
HF_EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
=
HF_EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
,
check_max_version
=
False
,
check_version_reason
=
"vllm"
,
)
# Ensure all model classes can be imported successfully
# Ensure all model classes can be imported successfully
model_cls
=
ModelRegistry
.
_try_load_model_cls
(
model_arch
)
model_cls
=
ModelRegistry
.
_try_load_model_cls
(
model_arch
)
assert
model_cls
is
not
None
assert
model_cls
is
not
None
...
...
tests/models/test_terratorch.py
View file @
7e63ef82
...
@@ -38,7 +38,7 @@ def test_inference(
...
@@ -38,7 +38,7 @@ def test_inference(
max_num_seqs
=
32
,
max_num_seqs
=
32
,
default_torch_num_threads
=
1
,
default_torch_num_threads
=
1
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_output
=
vllm_model
.
llm
.
encode
(
prompt
)
vllm_output
=
vllm_model
.
llm
.
encode
(
prompt
,
pooling_task
=
"plugin"
)
assert
torch
.
equal
(
assert
torch
.
equal
(
torch
.
isnan
(
vllm_output
[
0
].
outputs
.
data
).
any
(),
torch
.
tensor
(
False
)
torch
.
isnan
(
vllm_output
[
0
].
outputs
.
data
).
any
(),
torch
.
tensor
(
False
)
)
)
tests/models/test_vision.py
View file @
7e63ef82
...
@@ -21,6 +21,7 @@ from vllm.model_executor.models.vision import (
...
@@ -21,6 +21,7 @@ from vllm.model_executor.models.vision import (
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.network_utils
import
get_open_port
from
vllm.utils.network_utils
import
get_open_port
from
vllm.utils.system_utils
import
update_environment_variables
from
vllm.utils.system_utils
import
update_environment_variables
from
vllm.utils.torch_utils
import
set_random_seed
pytestmark
=
pytest
.
mark
.
cpu_test
pytestmark
=
pytest
.
mark
.
cpu_test
...
@@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct(
...
@@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct(
"""
"""
# Set random seed for reproducibility
# Set random seed for reproducibility
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
device
=
f
"
{
current_platform
.
device_name
}
:
{
local_rank
}
"
device
=
f
"
{
current_platform
.
device_name
}
:
{
local_rank
}
"
current_platform
.
set_device
(
device
)
current_platform
.
set_device
(
device
)
...
@@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
...
@@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(
calling the model directly.
calling the model directly.
"""
"""
# Set random seed for reproducibility
# Set random seed for reproducibility
current_platform
.
seed_everything
(
0
)
set_random_seed
(
0
)
device
=
f
"
{
current_platform
.
device_name
}
:
{
local_rank
}
"
device
=
f
"
{
current_platform
.
device_name
}
:
{
local_rank
}
"
current_platform
.
set_device
(
device
)
current_platform
.
set_device
(
device
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
...
@@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
...
@@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(
):
):
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
"""Test run_dp_sharded_mrope_vision_model with uneven load distribution."""
# Set up distributed environment
# Set up distributed environment
current_platform
.
seed_everything
(
123
)
set_random_seed
(
123
)
device
=
f
"
{
current_platform
.
device_name
}
:
{
local_rank
}
"
device
=
f
"
{
current_platform
.
device_name
}
:
{
local_rank
}
"
current_platform
.
set_device
(
device
)
current_platform
.
set_device
(
device
)
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
...
...
tests/models/utils.py
View file @
7e63ef82
...
@@ -10,7 +10,8 @@ import torch
...
@@ -10,7 +10,8 @@ import torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
transformers
import
PretrainedConfig
from
transformers
import
PretrainedConfig
from
vllm.config.model
import
ModelConfig
,
ModelDType
,
RunnerOption
from
vllm.config.model
import
AttnTypeStr
,
ModelConfig
,
ModelDType
,
RunnerOption
from
vllm.config.pooler
import
SequencePoolingType
,
TokenPoolingType
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.logprobs
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.multimodal.processing
import
InputProcessingContext
from
vllm.multimodal.processing
import
InputProcessingContext
from
vllm.tokenizers
import
cached_tokenizer_from_config
from
vllm.tokenizers
import
cached_tokenizer_from_config
...
@@ -292,7 +293,11 @@ def build_model_context(
...
@@ -292,7 +293,11 @@ def build_model_context(
"""
"""
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model_id
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
,
check_max_version
=
False
,
check_version_reason
=
"vllm"
,
)
model_config_kwargs
=
model_config_kwargs
or
{}
model_config_kwargs
=
model_config_kwargs
or
{}
limit_mm_per_prompt
=
limit_mm_per_prompt
or
{}
limit_mm_per_prompt
=
limit_mm_per_prompt
or
{}
...
@@ -375,7 +380,11 @@ class ModelInfo:
...
@@ -375,7 +380,11 @@ class ModelInfo:
max_model_len
:
int
|
None
=
None
max_model_len
:
int
|
None
=
None
hf_dtype
:
str
=
"float32"
hf_dtype
:
str
=
"float32"
hf_overrides
:
dict
[
str
,
Any
]
|
None
=
None
hf_overrides
:
dict
[
str
,
Any
]
|
None
=
None
default_pooling_type
:
str
=
""
seq_pooling_type
:
SequencePoolingType
|
None
=
None
tok_pooling_type
:
TokenPoolingType
|
None
=
None
attn_type
:
AttnTypeStr
|
None
=
None
is_prefix_caching_supported
:
bool
|
None
=
None
is_chunked_prefill_supported
:
bool
|
None
=
None
enable_test
:
bool
=
True
enable_test
:
bool
=
True
...
@@ -386,29 +395,10 @@ class EmbedModelInfo(ModelInfo):
...
@@ -386,29 +395,10 @@ class EmbedModelInfo(ModelInfo):
matryoshka_dimensions
:
list
[
int
]
|
None
=
None
matryoshka_dimensions
:
list
[
int
]
|
None
=
None
@
dataclass
class
CLSPoolingEmbedModelInfo
(
EmbedModelInfo
):
default_pooling_type
:
str
=
"CLS"
@
dataclass
class
LASTPoolingEmbedModelInfo
(
EmbedModelInfo
):
default_pooling_type
:
str
=
"LAST"
@
dataclass
@
dataclass
class
RerankModelInfo
(
ModelInfo
):
class
RerankModelInfo
(
ModelInfo
):
mteb_score
:
float
|
None
=
None
mteb_score
:
float
|
None
=
None
chat_template_name
:
str
|
None
=
None
@
dataclass
class
CLSPoolingRerankModelInfo
(
RerankModelInfo
):
default_pooling_type
:
str
=
"CLS"
@
dataclass
class
LASTPoolingRerankModelInfo
(
RerankModelInfo
):
default_pooling_type
:
str
=
"LAST"
@
dataclass
@
dataclass
...
@@ -483,12 +473,16 @@ def dummy_hf_overrides(
...
@@ -483,12 +473,16 @@ def dummy_hf_overrides(
"num_kv_shared_layers"
:
1
,
"num_kv_shared_layers"
:
1
,
}
}
_hf_config
=
hf_config
class
DummyConfig
:
class
DummyConfig
:
hf_config
=
_hf_config
hf_text_config
=
text_config
hf_text_config
=
text_config
model_arch_config
=
ModelConfig
.
get_model_arch_config
(
DummyConfig
)
# Only set MoE related config when the model has MoE layers.
# Only set MoE related config when the model has MoE layers.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
# Otherwise all models detected as MoE by _get_transformers_backend_cls.
if
M
odel
C
onfig
.
get_
num_experts
(
DummyConfig
)
>
0
:
if
m
odel
_arch_c
onfig
.
num_experts
>
0
:
update_dict
.
update
(
update_dict
.
update
(
{
{
"num_experts"
:
num_experts
,
"num_experts"
:
num_experts
,
...
...
tests/multimodal/test_audio.py
View file @
7e63ef82
...
@@ -7,10 +7,16 @@ from unittest.mock import patch
...
@@ -7,10 +7,16 @@ from unittest.mock import patch
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
torch
from
vllm.multimodal.audio
import
(
from
vllm.multimodal.audio
import
(
MONO_AUDIO_SPEC
,
PASSTHROUGH_AUDIO_SPEC
,
AudioMediaIO
,
AudioMediaIO
,
AudioResampler
,
AudioResampler
,
AudioSpec
,
ChannelReduction
,
normalize_audio
,
resample_audio_librosa
,
resample_audio_librosa
,
resample_audio_scipy
,
resample_audio_scipy
,
)
)
...
@@ -137,3 +143,500 @@ def test_audio_media_io_encode_base64(dummy_audio):
...
@@ -137,3 +143,500 @@ def test_audio_media_io_encode_base64(dummy_audio):
decoded
=
base64
.
b64decode
(
out
)
decoded
=
base64
.
b64decode
(
out
)
assert
decoded
==
b
"dummy_wav_data"
assert
decoded
==
b
"dummy_wav_data"
mock_write
.
assert_called_once
()
mock_write
.
assert_called_once
()
# ============================================================
# Tests for normalize_audio function
# ============================================================
class TestNormalizeAudio:
    """Exercise normalize_audio against passthrough, mono and custom specs."""

    def test_passthrough_preserves_audio(self):
        """The passthrough spec must hand back the samples unmodified."""
        two_channel = np.array(
            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
            dtype=np.float32,
        )

        normalized = normalize_audio(two_channel, PASSTHROUGH_AUDIO_SPEC)

        np.testing.assert_array_equal(normalized, two_channel)

    def test_mono_spec_with_numpy_stereo(self):
        """A two-channel numpy array collapses to one dimension under mono."""
        two_channel = np.array([[1.0, 2.0], [-1.0, 0.0]], dtype=np.float32)
        # Per-sample average of the two channels.
        expected = [0.0, 1.0]

        normalized = normalize_audio(two_channel, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        np.testing.assert_array_almost_equal(normalized, expected)

    def test_mono_spec_with_torch_stereo(self):
        """A two-channel torch tensor collapses to one dimension under mono."""
        two_channel = torch.tensor([[1.0, 2.0], [-1.0, 0.0]])

        normalized = normalize_audio(two_channel, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        # Per-sample average of the two channels.
        torch.testing.assert_close(normalized, torch.tensor([0.0, 1.0]))

    def test_mono_passthrough_for_1d_numpy(self):
        """Already-mono numpy input is returned with the same values."""
        single_channel = np.array([1.0, 2.0, 3.0], dtype=np.float32)

        normalized = normalize_audio(single_channel, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        np.testing.assert_array_equal(normalized, single_channel)

    def test_mono_passthrough_for_1d_torch(self):
        """Already-mono torch input is returned with the same values."""
        single_channel = torch.tensor([1.0, 2.0, 3.0])

        normalized = normalize_audio(single_channel, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        torch.testing.assert_close(normalized, single_channel)

    def test_first_channel_reduction(self):
        """ChannelReduction.FIRST keeps only channel 0."""
        first_only = AudioSpec(
            target_channels=1,
            channel_reduction=ChannelReduction.FIRST,
        )
        two_channel = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)

        normalized = normalize_audio(two_channel, first_only)

        np.testing.assert_array_equal(normalized, [1.0, 2.0])

    def test_max_channel_reduction(self):
        """ChannelReduction.MAX keeps the per-sample maximum over channels."""
        max_spec = AudioSpec(
            target_channels=1,
            channel_reduction=ChannelReduction.MAX,
        )
        two_channel = np.array([[1.0, 4.0], [3.0, 2.0]], dtype=np.float32)

        normalized = normalize_audio(two_channel, max_spec)

        np.testing.assert_array_equal(normalized, [3.0, 4.0])

    def test_sum_channel_reduction(self):
        """ChannelReduction.SUM adds the channels sample by sample."""
        sum_spec = AudioSpec(
            target_channels=1,
            channel_reduction=ChannelReduction.SUM,
        )
        two_channel = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)

        normalized = normalize_audio(two_channel, sum_spec)

        np.testing.assert_array_equal(normalized, [4.0, 6.0])

    def test_invalid_3d_array_raises(self):
        """Rank-3 input is rejected with a ValueError."""
        rank3_audio = np.random.randn(2, 3, 4).astype(np.float32)

        with pytest.raises(ValueError, match="Unsupported audio"):
            normalize_audio(rank3_audio, MONO_AUDIO_SPEC)

    def test_channel_expansion_raises(self):
        """Asking for more channels than the input has raises ValueError."""
        single_channel = np.array([1.0, 2.0, 3.0], dtype=np.float32)
        stereo_target = AudioSpec(target_channels=2)

        with pytest.raises(ValueError, match="Cannot expand"):
            normalize_audio(single_channel, stereo_target)

    def test_time_channels_format_numpy(self):
        """A (time, channels) numpy array is still reduced along channels.

        Loaders such as soundfile yield (time, channels), whereas torchaudio
        yields (channels, time); the mono result must have one value per
        time step either way.
        """
        n_samples = 1000
        # Each row is one time step holding both channel values.
        time_major = np.array([[1.0, -1.0]] * n_samples, dtype=np.float32)
        assert time_major.shape == (n_samples, 2)

        normalized = normalize_audio(time_major, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        assert normalized.shape == (n_samples,)
        # 1.0 and -1.0 average to 0.0 at every time step.
        np.testing.assert_array_almost_equal(normalized, np.zeros(n_samples))

    def test_time_channels_format_torch(self):
        """A (time, channels) torch tensor is still reduced along channels."""
        n_samples = 1000
        # Each row is one time step holding both channel values.
        time_major = torch.tensor([[1.0, -1.0]] * n_samples)
        assert time_major.shape == (n_samples, 2)

        normalized = normalize_audio(time_major, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        assert normalized.shape == (n_samples,)
        # 1.0 and -1.0 average to 0.0 at every time step.
        torch.testing.assert_close(normalized, torch.zeros(n_samples))

    def test_channels_time_format_preserved(self):
        """Input already laid out as (channels, time) reduces as expected."""
        n_samples = 1000
        # One row per channel, n_samples columns.
        channel_major = np.array(
            [[1.0] * n_samples, [-1.0] * n_samples],
            dtype=np.float32,
        )
        assert channel_major.shape == (2, n_samples)

        normalized = normalize_audio(channel_major, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        assert normalized.shape == (n_samples,)
        # 1.0 and -1.0 average to 0.0 at every time step.
        np.testing.assert_array_almost_equal(normalized, np.zeros(n_samples))

    def test_ambiguous_square_audio_numpy(self):
        """An (N, N) array is interpreted as (channels, time).

        The layout heuristic compares shape[0] with shape[1]; for a square
        array they are equal, so the input is expected to be reduced along
        axis 0 without being transposed first.
        """
        # Rows are 1..4, 5..8, 9..12, 13..16.
        square = np.arange(1.0, 17.0, dtype=np.float32).reshape(4, 4)
        assert square.shape == (4, 4)

        normalized = normalize_audio(square, MONO_AUDIO_SPEC)

        assert normalized.ndim == 1
        assert normalized.shape == (4,)
        # Column means: (1+5+9+13)/4, (2+6+10+14)/4, ...
        column_means = np.array([7.0, 8.0, 9.0, 10.0])
        np.testing.assert_array_almost_equal(normalized, column_means)
# ============================================================
# Tests for MultiModalDataParser integration with target_channels
# ============================================================
class TestMultiModalDataParserChannelNormalization:
    """Check how MultiModalDataParser applies its target_channels setting.

    Every test feeds an (audio, sample_rate) tuple to _parse_audio_data and
    inspects the channel layout of the first parsed item.
    """

    def test_parser_normalizes_stereo_to_mono(self):
        """target_channels=1 turns a two-channel input into 1D audio."""
        from vllm.multimodal.parse import MultiModalDataParser

        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        # (channels, time) layout: 2 channels, 3 samples each.
        two_channel = np.array(
            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
            dtype=np.float32,
        )

        parsed = mono_parser._parse_audio_data((two_channel, 16000))
        audio_item = parsed.get(0)

        assert audio_item.ndim == 1, f"Expected 1D mono audio, got {audio_item.ndim}D"
        assert audio_item.shape == (3,), f"Expected shape (3,), got {audio_item.shape}"
        # Opposite-sign channels average out to zero.
        np.testing.assert_array_almost_equal(audio_item, np.zeros(3))

    def test_parser_preserves_stereo_when_target_channels_none(self):
        """target_channels=None leaves a two-channel input two-dimensional."""
        from vllm.multimodal.parse import MultiModalDataParser

        passthrough_parser = MultiModalDataParser(
            target_sr=16000,
            target_channels=None,
        )

        two_channel = np.array(
            [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]],
            dtype=np.float32,
        )

        parsed = passthrough_parser._parse_audio_data((two_channel, 16000))
        audio_item = parsed.get(0)

        # No channel normalization was requested, so the item stays 2D.
        assert audio_item.ndim == 2, f"Expected 2D stereo audio, got {audio_item.ndim}D"

    def test_parser_mono_passthrough_when_target_channels_1(self):
        """target_channels=1 keeps an already-1D input one-dimensional."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 16000
        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        single_channel = np.random.randn(n_samples).astype(np.float32)

        parsed = mono_parser._parse_audio_data((single_channel, 16000))
        audio_item = parsed.get(0)

        assert audio_item.ndim == 1
        assert audio_item.shape == (n_samples,)

    def test_parser_with_target_channels_2(self):
        """target_channels=2 brings a 6-channel input down to 2 channels."""
        from vllm.multimodal.parse import MultiModalDataParser

        stereo_parser = MultiModalDataParser(target_sr=16000, target_channels=2)

        # Six channels, as in a 5.1 surround recording.
        six_channel = np.random.randn(6, 1000).astype(np.float32)

        parsed = stereo_parser._parse_audio_data((six_channel, 16000))
        audio_item = parsed.get(0)

        assert audio_item.ndim == 2
        # Leading axis holds the channels.
        assert audio_item.shape[0] == 2
# ============================================================
# End-to-End Audio Pipeline Tests
# ============================================================
class TestAudioPipelineE2E:
    """Drive raw audio through MultiModalDataParser from input to output.

    The inputs imitate the array layouts produced by different audio
    loaders; each test checks the shape and values of the parsed item.
    """

    def test_stereo_audio_normalized_to_mono_e2e(self):
        """(channels, time) stereo, as torchaudio yields it, becomes mono."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 16000  # one second at 16 kHz
        # Left channel is all 1.0, right channel is all -1.0.
        channel_major = np.array(
            [[1.0] * n_samples, [-1.0] * n_samples],
            dtype=np.float32,
        )
        assert channel_major.shape == (2, n_samples)

        # Mono target, as used for Whisper-style models.
        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        parsed = mono_parser._parse_audio_data((channel_major, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
        assert audio_output.shape == (n_samples,)
        # 1.0 and -1.0 average to 0.0.
        np.testing.assert_array_almost_equal(
            audio_output, np.zeros(n_samples), decimal=5
        )

    def test_soundfile_format_normalized_to_mono_e2e(self):
        """(time, channels) stereo, as soundfile yields it, becomes mono."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 16000
        # Every row is one time step: [left, right].
        time_major = np.array([[0.5, -0.5]] * n_samples, dtype=np.float32)
        assert time_major.shape == (n_samples, 2)

        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        parsed = mono_parser._parse_audio_data((time_major, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1, f"Expected 1D, got {audio_output.ndim}D"
        assert audio_output.shape == (n_samples,)
        # 0.5 and -0.5 average to 0.0.
        np.testing.assert_array_almost_equal(
            audio_output, np.zeros(n_samples), decimal=5
        )

    def test_librosa_mono_passthrough_e2e(self):
        """1D mono input, as librosa yields it, keeps its shape and values."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 16000
        single_channel = np.random.randn(n_samples).astype(np.float32)
        assert single_channel.shape == (n_samples,)

        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        parsed = mono_parser._parse_audio_data((single_channel, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1
        assert audio_output.shape == (n_samples,)
        # The samples themselves must be unchanged.
        np.testing.assert_array_almost_equal(audio_output, single_channel)

    def test_multichannel_5_1_surround_to_mono_e2e(self):
        """Six-channel (5.1 surround) input is averaged down to mono."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 8000
        # Constant level per channel, in order: front left, front right,
        # center, LFE (subwoofer), rear left, rear right.
        channel_levels = (1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
        six_channel = np.array(
            [[level] * n_samples for level in channel_levels],
            dtype=np.float32,
        )
        assert six_channel.shape == (6, n_samples)

        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        parsed = mono_parser._parse_audio_data((six_channel, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1
        # Mean of 1..6 is 3.5.
        expected_value = sum(channel_levels) / len(channel_levels)
        np.testing.assert_array_almost_equal(
            audio_output, np.full(n_samples, expected_value), decimal=5
        )

    def test_torch_tensor_input_e2e(self):
        """Stereo data built as a torch.Tensor ends up as a mono ndarray."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 8000
        # Two channels, as torchaudio would return them.
        stereo_tensor = torch.tensor(
            [[1.0] * n_samples, [-1.0] * n_samples],
            dtype=torch.float32,
        )
        assert stereo_tensor.shape == (2, n_samples)

        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        # The tensor is converted to numpy before being handed to the parser.
        as_numpy = stereo_tensor.numpy()
        parsed = mono_parser._parse_audio_data((as_numpy, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1
        assert isinstance(audio_output, np.ndarray)
        # 1.0 and -1.0 average to 0.0.
        np.testing.assert_array_almost_equal(
            audio_output, np.zeros(n_samples), decimal=5
        )

    def test_passthrough_preserves_stereo_e2e(self):
        """With target_channels=None stereo input keeps its (2, N) shape."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 8000
        two_channel = np.array(
            [[1.0] * n_samples, [-1.0] * n_samples],
            dtype=np.float32,
        )

        # target_channels=None means no channel normalization is requested.
        passthrough_parser = MultiModalDataParser(
            target_sr=16000,
            target_channels=None,
        )

        parsed = passthrough_parser._parse_audio_data((two_channel, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 2
        assert audio_output.shape == (2, n_samples)

    def test_resampling_with_channel_normalization_e2e(self):
        """48 kHz stereo is both resampled to 16 kHz and reduced to mono."""
        from vllm.multimodal.parse import MultiModalDataParser

        source_sr = 48000
        target_sr = 16000
        # One second of stereo audio at the source rate.
        stereo_48k = np.array(
            [[1.0] * source_sr, [-1.0] * source_sr],
            dtype=np.float32,
        )

        parser = MultiModalDataParser(target_sr=target_sr, target_channels=1)

        parsed = parser._parse_audio_data((stereo_48k, source_sr))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1
        # One second of audio at the target rate.
        assert audio_output.shape[0] == 16000

    def test_very_short_audio_e2e(self):
        """A 10-sample stereo clip is still reduced to 10 mono samples."""
        from vllm.multimodal.parse import MultiModalDataParser

        n_samples = 10
        tiny_stereo = np.array(
            [[1.0] * n_samples, [-1.0] * n_samples],
            dtype=np.float32,
        )

        mono_parser = MultiModalDataParser(target_sr=16000, target_channels=1)

        parsed = mono_parser._parse_audio_data((tiny_stereo, 16000))
        audio_output = parsed.get(0)

        assert audio_output.ndim == 1
        assert audio_output.shape == (n_samples,)
        np.testing.assert_array_almost_equal(audio_output, np.zeros(n_samples))
Prev
1
…
24
25
26
27
28
29
30
31
32
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment