Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
711aa9d5
Commit
711aa9d5
authored
Jul 30, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.0' into v0.10.0-dev
parents
751c492c
6d8d0a24
Changes
519
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1536 additions
and
130 deletions
+1536
-130
tests/models/language/pooling/test_truncation_control.py
tests/models/language/pooling/test_truncation_control.py
+3
-3
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_common.py
+70
-2
tests/models/multimodal/generation/test_maverick.py
tests/models/multimodal/generation/test_maverick.py
+652
-0
tests/models/multimodal/generation/test_pixtral.py
tests/models/multimodal/generation/test_pixtral.py
+2
-3
tests/models/multimodal/generation/test_voxtral.py
tests/models/multimodal/generation/test_voxtral.py
+115
-0
tests/models/multimodal/generation/test_whisper.py
tests/models/multimodal/generation/test_whisper.py
+1
-1
tests/models/multimodal/generation/vlm_utils/core.py
tests/models/multimodal/generation/vlm_utils/core.py
+1
-1
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+1
-1
tests/models/multimodal/pooling/test_jinavl_reranker.py
tests/models/multimodal/pooling/test_jinavl_reranker.py
+187
-0
tests/models/multimodal/pooling/test_prithvi_mae.py
tests/models/multimodal/pooling/test_prithvi_mae.py
+63
-0
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+4
-2
tests/models/multimodal/processing/test_nemotron_vl.py
tests/models/multimodal/processing/test_nemotron_vl.py
+134
-0
tests/models/multimodal/processing/test_transformers.py
tests/models/multimodal/processing/test_transformers.py
+40
-0
tests/models/quantization/test_bitsandbytes.py
tests/models/quantization/test_bitsandbytes.py
+50
-23
tests/models/quantization/test_modelopt.py
tests/models/quantization/test_modelopt.py
+3
-3
tests/models/quantization/test_nvfp4.py
tests/models/quantization/test_nvfp4.py
+3
-3
tests/models/registry.py
tests/models/registry.py
+112
-70
tests/models/test_initialization.py
tests/models/test_initialization.py
+47
-9
tests/models/test_registry.py
tests/models/test_registry.py
+9
-5
tests/models/test_transformers.py
tests/models/test_transformers.py
+39
-4
No files found.
Too many changes to show.
To preserve performance only
519 of 519+
files are displayed.
Plain diff
Email patch
tests/models/language/pooling/test_truncation_control.py
View file @
711aa9d5
...
...
@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
with
vllm_runner
(
model_name
,
task
=
"embed"
,
max_model_len
=
max_model_len
)
as
vllm_model
:
vllm_output
=
vllm_model
.
model
.
encode
(
vllm_output
=
vllm_model
.
llm
.
encode
(
input_str
,
truncate_prompt_tokens
=
truncate_prompt_tokens
)
prompt_tokens
=
vllm_output
[
0
].
prompt_token_ids
...
...
@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
with
vllm_runner
(
model_name
,
task
=
"embed"
,
max_model_len
=
max_model_len
)
as
vllm_model
:
vllm_output
=
vllm_model
.
model
.
encode
(
vllm_output
=
vllm_model
.
llm
.
encode
(
input_str
,
truncate_prompt_tokens
=
truncate_prompt_tokens
)
prompt_tokens
=
vllm_output
[
0
].
prompt_token_ids
...
...
@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name
,
task
=
"embed"
,
max_model_len
=
max_model_len
)
as
vllm_model
:
llm_output
=
vllm_model
.
model
.
encode
(
llm_output
=
vllm_model
.
llm
.
encode
(
input_str
,
truncate_prompt_tokens
=
truncate_prompt_tokens
)
assert
llm_output
==
f
"""truncate_prompt_tokens value
...
...
tests/models/multimodal/generation/test_common.py
View file @
711aa9d5
...
...
@@ -37,6 +37,8 @@ if current_platform.is_rocm():
REQUIRES_V0_MODELS
=
[
# V1 Test: not enough KV cache space in C1.
"fuyu"
,
# V1 Test: Deadlock issue when processing mm_inputs
"llava-onevision-transformers"
,
]
# yapf: disable
...
...
@@ -155,6 +157,7 @@ VLM_TEST_SETTINGS = {
video_idx_to_prompt
=
lambda
idx
:
"<|vision_bos|><|VIDEO|><|vision_eos|>"
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
num_logprobs
=
6
if
current_platform
.
is_cpu
()
else
5
,
auto_cls
=
AutoModelForTextToWaveform
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
patch_hf_runner
=
model_utils
.
qwen2_5_omni_patch_hf_runner
,
...
...
@@ -172,6 +175,71 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc
=
model_utils
.
ultravox_trunc_hf_output
,
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
#### Transformers fallback to test
## To reduce test burden, we only test batching arbitrary image size
# Dynamic image length and number of patches
"llava-onevision-transformers"
:
VLMTestInfo
(
models
=
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
max_model_len
=
16384
,
hf_model_kwargs
=
model_utils
.
llava_onevision_hf_model_kwargs
(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
),
# noqa: E501
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
model_utils
.
llava_onevision_vllm_to_hf_output
,
image_size_factors
=
[(
0.25
,
0.5
,
1.0
)],
vllm_runner_kwargs
=
{
"model_impl"
:
"transformers"
,
},
marks
=
[
pytest
.
mark
.
core_model
],
),
# FIXME(Isotr0py): Enable this test after
# https://github.com/huggingface/transformers/pull/39470 released
# "idefics3-transformers": VLMTestInfo(
# models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
# img_idx_to_prompt=lambda idx: "<image>",
# max_model_len=8192,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
# image_size_factors=[(0.25, 0.5, 1.0)],
# vllm_runner_kwargs={
# "model_impl": "transformers",
# },
# marks=[pytest.mark.core_model],
# ),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers"
:
VLMTestInfo
(
models
=
[
"Qwen/Qwen2.5-VL-3B-Instruct"
],
test_type
=
VLMTestType
.
IMAGE
,
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<|vision_start|><|image_pad|><|vision_end|>"
,
# noqa: E501
max_model_len
=
4096
,
max_num_seqs
=
2
,
auto_cls
=
AutoModelForImageTextToText
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
image_size_factors
=
[(
0.25
,
0.2
,
0.15
)],
vllm_runner_kwargs
=
{
"model_impl"
:
"transformers"
,
},
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
# Check "auto" with fallback to transformers
"internvl-transformers"
:
VLMTestInfo
(
models
=
[
"OpenGVLab/InternVL3-1B-hf"
],
test_type
=
(
VLMTestType
.
IMAGE
,
VLMTestType
.
MULTI_IMAGE
),
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
img_idx_to_prompt
=
lambda
idx
:
"<IMG_CONTEXT>"
,
max_model_len
=
4096
,
use_tokenizer_eos
=
True
,
image_size_factors
=
[(
0.25
,
0.5
,
1.0
)],
vllm_runner_kwargs
=
{
"model_impl"
:
"auto"
,
},
auto_cls
=
AutoModelForImageTextToText
,
marks
=
[
pytest
.
mark
.
core_model
],
),
#### Extended model tests
"aria"
:
VLMTestInfo
(
models
=
[
os
.
path
.
join
(
models_path_prefix
,
"rhymes-ai/Aria"
)],
...
...
@@ -320,6 +388,7 @@ VLM_TEST_SETTINGS = {
num_logprobs
=
10
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
auto_cls
=
AutoModelForImageTextToText
,
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
"glm4_1v-video"
:
VLMTestInfo
(
models
=
[
"THUDM/GLM-4.1V-9B-Thinking"
],
...
...
@@ -333,8 +402,7 @@ VLM_TEST_SETTINGS = {
inputs
=
custom_inputs
.
video_with_metadata_glm4_1v
(),
limit_mm_per_prompt
=
{
"video"
:
1
},
)],
# This is needed to run on machine with 24GB VRAM
vllm_runner_kwargs
=
{
"gpu_memory_utilization"
:
0.95
},
marks
=
[
large_gpu_mark
(
min_gb
=
32
)],
),
"h2ovl"
:
VLMTestInfo
(
models
=
[
...
...
tests/models/multimodal/generation/test_maverick.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Create a reduced-layer version of the Maverick model for testing purposes.
This script creates a new model with fewer layers by:
1. Loading the original Maverick model configuration
2. Creating a reduced configuration
3. Generating compatible safetensors files with appropriate weights
4. Creating the necessary index files for vLLM compatibility
"""
import
json
import
shutil
from
pathlib
import
Path
from
typing
import
Any
import
pytest
import
torch
from
safetensors.torch
import
save_file
from
transformers
import
(
AutoConfig
,
AutoProcessor
,
AutoTokenizer
,
GenerationConfig
)
from
vllm
import
LLM
,
SamplingParams
from
....utils
import
multi_gpu_test
# Sample prompts for testing
PROMPTS
:
list
[
str
]
=
[
"Hello, my name is"
,
"The president of the United States is"
,
"The capital of France is"
,
"The future of AI is"
,
]
def run_maverick_serving(model: str):
    """Generate completions for ``PROMPTS`` with the given Maverick model.

    The engine is built with a fixed configuration (eager mode, tensor
    parallel size 8, expert parallelism, fp8 KV cache). Each prompt and its
    first completion are printed. Any exception is reported on stdout and
    then re-raised to the caller.

    Args:
        model: Model name or path handed to ``LLM``.
    """
    try:
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
        engine = LLM(
            model=model,
            max_model_len=2048,
            enforce_eager=True,
            tensor_parallel_size=8,
            enable_expert_parallel=True,
            trust_remote_code=True,
            gpu_memory_utilization=0.4,
            kv_cache_dtype="fp8",
        )
        results = engine.generate(PROMPTS, sampling_params)

        # Print the outputs
        separator = "-" * 60
        print("\nGenerated Outputs:\n" + separator)
        for result in results:
            completion = result.outputs[0].text
            print(f"Prompt:    {result.prompt!r}")
            print(f"Output:    {completion!r}")
            print(separator)
    except Exception as e:
        # Report the failure, then let it propagate unchanged.
        print(f"Error initializing or running model: {e}")
        raise
def create_reduced_maverick_model(
    original_model_name: str = (
        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"),
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
    vision_layers: int = 2,
    force_recreate: bool = False,
) -> str:
    """
    Build a reduced-layer copy of the Maverick model on disk.

    If ``output_dir`` already exists it is reused as-is unless
    ``force_recreate`` is set, in which case it is deleted and rebuilt.
    If any build step fails, the output directory is removed and the
    exception is re-raised.

    Args:
        original_model_name: Name of the original Maverick model
        output_dir: Directory to save the reduced model
        text_layers: Number of text transformer layers
        num_experts: Number of experts per layer
        vision_layers: Number of vision transformer layers
        force_recreate: Whether to recreate if output_dir already exists

    Returns:
        Path to the created reduced model directory
    """
    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
        f"{vision_layers} vision layers...")

    # Prepare the output directory
    target_dir = Path(output_dir)
    if target_dir.exists():
        if not force_recreate:
            print(f"Output directory {output_dir} already exists. "
                  "Use --force-recreate to overwrite.")
            return str(target_dir)
        shutil.rmtree(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    try:
        print("Loading original model configuration...")
        original_config = AutoConfig.from_pretrained(original_model_name,
                                                     trust_remote_code=True)

        print("Creating reduced configuration...")
        reduced_config = create_reduced_config(original_config, text_layers,
                                               num_experts, vision_layers)

        config_path = target_dir / "config.json"
        with open(config_path, "w") as config_file:
            json.dump(reduced_config, config_file, indent=2)
        print(f"Saved reduced config to {config_path}")

        print("Copying tokenizer files...")
        copy_tokenizer_files(original_model_name, target_dir)

        print("Creating reduced safetensors files...")
        create_reduced_safetensors(original_config, reduced_config,
                                   target_dir)

        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, target_dir)

        # The generation config is optional: a failure here is only reported.
        try:
            generation_config = GenerationConfig.from_pretrained(
                original_model_name)
            generation_config.save_pretrained(target_dir)
            print("Copied generation config")
        except Exception as gen_err:
            print(f"Could not copy generation config: {gen_err}")

        print(f"Successfully created reduced Maverick model at {target_dir}")
        return str(target_dir)

    except Exception as e:
        print(f"Error creating reduced model: {e}")
        # Clean up on failure
        if target_dir.exists():
            shutil.rmtree(target_dir)
        raise
def create_reduced_config(original_config: Any, text_layers: int,
                          num_experts: int,
                          vision_layers: int) -> dict[str, Any]:
    """Derive a reduced configuration dictionary from the original config.

    Args:
        original_config: Object exposing ``to_dict()``.
        text_layers: New ``num_hidden_layers`` for ``text_config``.
        num_experts: New ``num_local_experts`` for ``text_config``.
        vision_layers: New ``num_hidden_layers`` for ``vision_config``.

    Returns:
        The dictionary from ``to_dict()`` with the text/vision sections
        shrunk (when present) and ``_name_or_path`` renamed.
    """
    # Work on the dictionary form of the config
    cfg = original_config.to_dict()

    # Shrink the text model
    if "text_config" in cfg:
        text_cfg = cfg["text_config"]

        previous_layers = text_cfg["num_hidden_layers"]
        text_cfg["num_hidden_layers"] = text_layers
        print(f"Reduced text layers from {previous_layers} to {text_layers}")

        previous_experts = text_cfg["num_local_experts"]
        text_cfg["num_local_experts"] = num_experts
        print(f"Reduced num experts from {previous_experts} to {num_experts}")

        # Hidden size and head dim are both divided by the same factor.
        hidden_dim_divisor = 4

        previous_hidden = text_cfg["hidden_size"]
        reduced_hidden = previous_hidden // hidden_dim_divisor
        text_cfg["hidden_size"] = reduced_hidden
        print(f"Reduced hidden size from {previous_hidden} to "
              f"{reduced_hidden}")

        previous_head_dim = text_cfg["head_dim"]
        reduced_head_dim = previous_head_dim // hidden_dim_divisor
        text_cfg["head_dim"] = reduced_head_dim
        print(f"Reduced head dim from {previous_head_dim} to "
              f"{reduced_head_dim}")

    # Shrink the vision model
    if "vision_config" in cfg:
        vision_cfg = cfg["vision_config"]
        previous_vision_layers = vision_cfg["num_hidden_layers"]
        vision_cfg["num_hidden_layers"] = vision_layers
        print(f"Reduced vision layers from {previous_vision_layers} "
              f"to {vision_layers}")

    # Tag the config so it is recognisable as a reduced variant
    cfg["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"

    return cfg
def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Load the original model's tokenizer and save it into ``output_path``.

    This is best-effort: any failure is reported as a warning on stdout and
    is not propagated.
    """
    try:
        AutoTokenizer.from_pretrained(
            original_model_name,
            trust_remote_code=True,
        ).save_pretrained(output_path)
        print("Tokenizer files copied successfully")
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any,
                               output_path: Path) -> None:
    """Save the original model's processor files into ``output_path``.

    The processor is loaded from ``original_config._name_or_path``, falling
    back to the FP8 Maverick checkpoint name when that attribute is empty.
    On failure the error is printed and then re-raised.
    """
    # Try to load the original preprocessor config
    try:
        source_model = (original_config._name_or_path
                        or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8")
        original_processor = AutoProcessor.from_pretrained(
            source_model,
            trust_remote_code=True,
        )
        original_processor.save_pretrained(output_path)
        print("Copied original preprocessor config")
    except Exception as e:
        print(f"Could not copy original preprocessor config: {e}")
        raise
def create_reduced_safetensors(original_config: Any,
                               reduced_config: dict[str, Any],
                               output_path: Path) -> None:
    """Generate synthetic weights for the reduced model and write them out.

    Text, vision and shared (projector) weights are generated in that order
    from ``reduced_config`` and merged into one mapping, which is then saved
    as safetensors under ``output_path``. ``original_config`` is accepted
    but not read here.
    """
    print("Generating synthetic weights for reduced model...")

    text_cfg = reduced_config["text_config"]
    vision_cfg = reduced_config["vision_config"]

    all_weights = {}

    print("Creating text model weights...")
    text_weights = create_text_model_weights(text_cfg)
    all_weights.update(text_weights)

    print("Creating vision model weights...")
    vision_weights = create_vision_model_weights(vision_cfg)
    all_weights.update(vision_weights)

    print("Creating shared model weights...")
    shared_weights = create_shared_weights(text_cfg, vision_cfg)
    all_weights.update(shared_weights)

    print("Saving weights to safetensors files...")
    save_weights_to_safetensors(all_weights, output_path)
def create_text_model_weights(
        text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure.

    Args:
        text_config: Text configuration dictionary. Reads ``vocab_size``,
            ``hidden_size``, ``intermediate_size``,
            ``intermediate_size_mlp``, ``num_hidden_layers``,
            ``num_attention_heads``, and optionally ``num_key_value_heads``,
            ``num_local_experts`` and ``interleave_moe_layer_step``.

    Returns:
        Mapping from parameter name to a freshly created tensor.
    """
    vocab_size = text_config["vocab_size"]
    hidden_size = text_config["hidden_size"]
    intermediate_size = text_config["intermediate_size"]
    intermediate_size_mlp = text_config["intermediate_size_mlp"]
    num_layers = text_config["num_hidden_layers"]
    num_attention_heads = text_config["num_attention_heads"]
    num_key_value_heads = text_config.get("num_key_value_heads",
                                          num_attention_heads)

    # MoE specific parameters
    num_experts = text_config.get("num_local_experts")
    assert (num_experts
            is not None), "num_local_experts must be specified for MoE"

    head_dim = hidden_size // num_attention_heads
    q_width = num_attention_heads * head_dim
    kv_width = num_key_value_heads * head_dim

    # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
    # 0,2,4,... are dense
    interleave_step = text_config.get("interleave_moe_layer_step", 1)

    def bf16_randn(*shape: int) -> torch.Tensor:
        return torch.randn(*shape, dtype=torch.bfloat16)

    def bf16_ones(*shape: int) -> torch.Tensor:
        return torch.ones(*shape, dtype=torch.bfloat16)

    tensors: dict[str, torch.Tensor] = {}

    # Embedding layers
    tensors["language_model.model.embed_tokens.weight"] = torch.randn(
        vocab_size, hidden_size, dtype=torch.float16)

    # Transformer layers
    for layer_idx in range(num_layers):
        prefix = f"language_model.model.layers.{layer_idx}"
        print(f"Creating weights for layer {prefix}...")

        # Self-attention weights (separate q, k, v projections)
        # NOTE(review): q/k are created as (hidden, width) while v is
        # (width, hidden) -- confirm the loader expects this orientation.
        attn = f"{prefix}.self_attn"
        tensors[f"{attn}.q_proj.weight"] = bf16_randn(hidden_size, q_width)
        tensors[f"{attn}.k_proj.weight"] = bf16_randn(hidden_size, kv_width)
        tensors[f"{attn}.v_proj.weight"] = bf16_randn(kv_width, hidden_size)
        tensors[f"{attn}.o_proj.weight"] = bf16_randn(hidden_size, q_width)
        print("Self-attention weights created.")

        # Feed-forward weights - MoE pattern based on
        # interleave_moe_layer_step
        ffn = f"{prefix}.feed_forward"
        if interleave_step > 0 and (layer_idx + 1) % interleave_step == 0:
            # MoE layer structure
            # 1. Router weights
            tensors[f"{ffn}.router.weight"] = torch.randn(
                num_experts, hidden_size, dtype=torch.float16)

            # 2. Individual expert weights (not fused)
            for expert_idx in range(num_experts):
                expert = f"{ffn}.experts.{expert_idx}"

                tensors[f"{expert}.gate_proj.weight"] = bf16_randn(
                    intermediate_size, hidden_size)
                tensors[f"{expert}.up_proj.weight"] = bf16_randn(
                    intermediate_size, hidden_size)
                tensors[f"{expert}.down_proj.weight"] = bf16_randn(
                    hidden_size, intermediate_size)

                # Expert weight scales (FP8 quantization)
                tensors[f"{expert}.gate_proj.weight_scale"] = bf16_ones(
                    intermediate_size, 1)
                tensors[f"{expert}.up_proj.weight_scale"] = bf16_ones(
                    intermediate_size, 1)
                tensors[f"{expert}.down_proj.weight_scale"] = bf16_ones(
                    hidden_size, 1)

            # 3. Shared expert weights
            shared = f"{ffn}.shared_expert"
            tensors[f"{shared}.gate_proj.weight"] = bf16_randn(
                intermediate_size, hidden_size)
            tensors[f"{shared}.up_proj.weight"] = bf16_randn(
                intermediate_size, hidden_size)
            tensors[f"{shared}.down_proj.weight"] = bf16_randn(
                hidden_size, intermediate_size)
            print(f"MoE feed-forward weights created for layer {layer_idx}.")
        else:
            # Dense layer structure
            tensors[f"{ffn}.gate_proj.weight"] = bf16_randn(
                intermediate_size_mlp, hidden_size)
            tensors[f"{ffn}.up_proj.weight"] = bf16_randn(
                intermediate_size_mlp, hidden_size)
            tensors[f"{ffn}.down_proj.weight"] = bf16_randn(
                hidden_size, intermediate_size_mlp)
            print(f"Dense feed-forward weights created for layer {layer_idx}.")

        # Layer norms
        tensors[f"{prefix}.input_layernorm.weight"] = bf16_ones(hidden_size)
        tensors[f"{prefix}.post_attention_layernorm.weight"] = bf16_ones(
            hidden_size)
        print("Layer norms created.")

    # Final layer norm and output projection
    tensors["language_model.model.norm.weight"] = bf16_ones(hidden_size)
    tensors["language_model.lm_head.weight"] = bf16_randn(
        vocab_size, hidden_size)

    return tensors
def create_vision_model_weights(
        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the vision model.

    Args:
        vision_config: Vision configuration dictionary. Reads
            ``hidden_size``, ``intermediate_size`` and ``num_hidden_layers``.

    Returns:
        Mapping from parameter name to a bfloat16 tensor: random weights,
        zero biases, and layer norms with unit weight and zero bias.
    """
    hidden_size = vision_config["hidden_size"]
    intermediate_size = vision_config["intermediate_size"]
    num_layers = vision_config["num_hidden_layers"]

    dtype = torch.bfloat16
    tensors: dict[str, torch.Tensor] = {}

    # Vision transformer layers
    for layer_idx in range(num_layers):
        prefix = f"vision_model.model.layers.{layer_idx}"

        # Attention projections: square weight followed by its bias
        for proj in ("q_proj", "k_proj", "v_proj", "o_proj"):
            tensors[f"{prefix}.self_attn.{proj}.weight"] = torch.randn(
                hidden_size, hidden_size, dtype=dtype)
            tensors[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(
                hidden_size, dtype=dtype)

        # MLP: fc1 expands to intermediate_size, fc2 projects back
        tensors[f"{prefix}.mlp.fc1.weight"] = torch.randn(intermediate_size,
                                                          hidden_size,
                                                          dtype=dtype)
        tensors[f"{prefix}.mlp.fc1.bias"] = torch.zeros(intermediate_size,
                                                        dtype=dtype)
        tensors[f"{prefix}.mlp.fc2.weight"] = torch.randn(hidden_size,
                                                          intermediate_size,
                                                          dtype=dtype)
        tensors[f"{prefix}.mlp.fc2.bias"] = torch.zeros(hidden_size,
                                                        dtype=dtype)

        # Layer norms
        for norm in ("input_layernorm", "post_attention_layernorm"):
            tensors[f"{prefix}.{norm}.weight"] = torch.ones(hidden_size,
                                                            dtype=dtype)
            tensors[f"{prefix}.{norm}.bias"] = torch.zeros(hidden_size,
                                                           dtype=dtype)

    return tensors
def create_shared_weights(
        text_config: dict[str, Any],
        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector).

    Returns a single-entry mapping holding a random bfloat16 projector
    weight of shape ``(text hidden_size, vision projector_input_dim)``.
    """
    out_features = text_config["hidden_size"]
    in_features = vision_config["projector_input_dim"]

    # Vision-language connector (projects vision features to text space)
    projector = torch.randn(out_features, in_features, dtype=torch.bfloat16)
    return {"multi_modal_projector.linear_1.weight": projector}
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
                                output_path: Path) -> None:
    """Save weights to safetensors files and create index.

    Tensors are packed greedily, in mapping order, into shards of at most
    5 GiB. A single shard is written as ``model.safetensors``; otherwise
    the shards are numbered ``model-XXXXX-of-YYYYY.safetensors``. In both
    cases ``model.safetensors.index.json`` is written with the total byte
    size and the parameter-to-file map.
    """
    # Determine how to shard the weights
    shard_limit = 5 * 1024 * 1024 * 1024  # 5GB per shard

    # Pack tensors into shards in insertion order
    shards = []
    pending: dict[str, torch.Tensor] = {}
    pending_bytes = 0

    for name, tensor in weights.items():
        nbytes = tensor.numel() * tensor.element_size()

        # Close the running shard before it would overflow. A non-empty
        # check keeps a single oversized tensor in a shard of its own.
        if pending and pending_bytes + nbytes > shard_limit:
            shards.append(pending)
            pending = {}
            pending_bytes = 0

        pending[name] = tensor
        pending_bytes += nbytes

    if pending:
        shards.append(pending)

    # Save shards and create index
    shard_count = len(shards)
    weight_map = {}

    if shard_count == 1:
        # Single file
        filename = "model.safetensors"
        save_file(shards[0], output_path / filename)
        weight_map = dict.fromkeys(shards[0], filename)
        print(f"Saved weights to single file: {filename}")
    else:
        # Multiple shards
        for number, shard in enumerate(shards, start=1):
            filename = (
                f"model-{number:05d}-of-{shard_count:05d}.safetensors")
            save_file(shard, output_path / filename)
            weight_map.update(dict.fromkeys(shard, filename))
            print(f"Saved shard {number}/{shard_count}: {filename}")

    # Create index file
    total_size = sum(t.numel() * t.element_size() for t in weights.values())
    index_data = {
        "metadata": {
            "total_size": total_size
        },
        "weight_map": weight_map,
    }

    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as index_file:
        json.dump(index_data, index_file, indent=2)

    print(f"Created index file: {index_path}")
    print(f"Total model size: "
          f"{total_size / (1024**3):.2f} GB")
def run_reduced_model(model_path: str,
                      should_profile: bool = False,
                      **kwargs) -> None:
    """Load the reduced model with vLLM and generate from ``PROMPTS``.

    Args:
        model_path: Directory of the reduced model.
        should_profile: When true, wrap generation in
            ``start_profile()`` / ``stop_profile()``.
        **kwargs: Extra keyword arguments forwarded to ``LLM``.
    """
    print(f"\nTesting reduced model at {model_path}...")

    engine = LLM(
        model=model_path,
        trust_remote_code=True,
        max_model_len=512,  # Small context for testing
        gpu_memory_utilization=0.3,  # Conservative memory usage
        **kwargs,
    )

    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)

    if should_profile:
        engine.start_profile()
    results = engine.generate(PROMPTS, params)
    if should_profile:
        engine.stop_profile()

    print("Test generation successful!")
    divider = "-" * 40
    for result in results:
        print(f"Prompt: {result.prompt}")
        print(f"Output: "
              f"{result.outputs[0].text}")
        print(divider)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(),
                    reason="CUDA not available")
def test_dummy_maverick(
    original_model_name: str,
    text_layers: int,
    num_experts: int,
    vision_layers: int,
    enforce_eager: bool,
    tp: int,
    ep: bool,
    output_dir: str = "/tmp/reduced_maverick",
    force_recreate: bool = True,
    profile: bool = False,
) -> None:
    """Build a reduced Maverick checkpoint and run generation on it.

    The checkpoint is created under ``output_dir`` (recreated by default),
    then loaded with the requested eager/tensor-parallel/expert-parallel
    settings.
    """
    reduced_model_dir = create_reduced_maverick_model(
        original_model_name=original_model_name,
        output_dir=output_dir,
        text_layers=text_layers,
        num_experts=num_experts,
        vision_layers=vision_layers,
        force_recreate=force_recreate,
    )

    print(f"\nReduced model created successfully at: {reduced_model_dir}")

    run_reduced_model(
        model_path=reduced_model_dir,
        should_profile=profile,
        enforce_eager=enforce_eager,
        tensor_parallel_size=tp,
        enable_expert_parallel=ep,
    )
def main():
    """Command-line entry point for creating and testing the reduced model.

    ``--test`` builds the reduced model and runs it (tp=2, expert parallel,
    eager mode); ``--test-original`` runs the original model instead of, or
    in addition to, that.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Create a reduced-layer Maverick model")

    # (flag, add_argument keyword arguments), registered in this order
    cli_options = [
        ("--output-dir", {
            "default": "/tmp/reduced_maverick",
            "help": "Output directory for the reduced model",
        }),
        ("--text-layers", {
            "type": int,
            "default": 4,
            "help": "Number of text transformer layers",
        }),
        ("--num-experts", {
            "type": int,
            "default": 4,
            "help": "Number of experts",
        }),
        ("--vision-layers", {
            "type": int,
            "default": 2,
            "help": "Number of vision transformer layers",
        }),
        ("--force-recreate", {
            "action": "store_true",
            "help": "Force recreation if output directory exists",
        }),
        ("--test", {
            "action": "store_true",
            "help": "Test the created model with vLLM",
        }),
        ("--profile", {
            "action": "store_true",
            "help": "Profile the created model with vLLM",
        }),
        ("--test-original", {
            "action": "store_true",
            "help": "Test the original model with vLLM",
        }),
        ("--original-model", {
            "default": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            "help": "Original model name to base the reduction on",
        }),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    if args.test:
        test_dummy_maverick(
            original_model_name=args.original_model,
            output_dir=args.output_dir,
            text_layers=args.text_layers,
            num_experts=args.num_experts,
            vision_layers=args.vision_layers,
            force_recreate=args.force_recreate,
            tp=2,
            ep=True,
            enforce_eager=True,
            profile=args.profile,
        )

    if args.test_original:
        run_maverick_serving(args.original_model)
if __name__ == "__main__":
    # Exit with whatever main() returns. SystemExit is raised directly
    # instead of calling the site-provided exit() helper, which is meant
    # for interactive sessions and is not defined when Python runs
    # without the site module.
    raise SystemExit(main())
tests/models/multimodal/generation/test_pixtral.py
View file @
711aa9d5
...
...
@@ -182,8 +182,7 @@ def test_chat(
)
as
vllm_model
:
outputs
=
[]
for
msg
in
MSGS
:
output
=
vllm_model
.
model
.
chat
(
msg
,
sampling_params
=
SAMPLING_PARAMS
)
output
=
vllm_model
.
llm
.
chat
(
msg
,
sampling_params
=
SAMPLING_PARAMS
)
outputs
.
extend
(
output
)
...
...
@@ -219,7 +218,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
max_model_len
=
8192
,
limit_mm_per_prompt
=
LIMIT_MM_PER_PROMPT
,
)
as
vllm_model
:
outputs
=
vllm_model
.
model
.
generate
(
prompt
)
outputs
=
vllm_model
.
llm
.
generate
(
prompt
)
assert
len
(
outputs
)
==
1
,
f
"
{
len
(
outputs
)
=
}
"
output
:
RequestOutput
=
outputs
[
0
]
...
...
tests/models/multimodal/generation/test_voxtral.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
pytest
import
pytest_asyncio
from
mistral_common.audio
import
Audio
from
mistral_common.protocol.instruct.messages
import
(
AudioChunk
,
RawAudio
,
TextChunk
,
UserMessage
)
from
vllm.transformers_utils.tokenizer
import
MistralTokenizer
from
....conftest
import
AudioTestAssets
from
....utils
import
RemoteOpenAIServer
from
.test_ultravox
import
MULTI_AUDIO_PROMPT
,
run_multi_audio_test
MODEL_NAME
=
"mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS
=
[
"--tokenizer_mode"
,
"mistral"
,
"--config_format"
,
"mistral"
,
"--load_format"
,
"mistral"
]
@pytest.fixture()
def server(request, audio_assets: AudioTestAssets):
    """Yield a running ``RemoteOpenAIServer`` for ``MODEL_NAME``.

    The server is started in eager mode with the Mistral format flags and
    an audio-per-prompt limit equal to the number of audio assets.
    """
    mm_limit = json.dumps({"audio": len(audio_assets)})
    cli_args = [
        "--enforce-eager",
        "--limit-mm-per-prompt",
        mm_limit,
    ] + MISTRAL_FORMAT_ARGS

    env = {"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
    with RemoteOpenAIServer(MODEL_NAME, cli_args,
                            env_dict=env) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture."""
    async with server.get_async_client() as openai_client:
        yield openai_client
def _get_prompt(audio_assets, question):
    """Build the chat-templated prompt for all audio assets plus a question.

    Args:
        audio_assets: Iterable of assets exposing ``get_local_path()``.
        question: Text placed after the audio chunks in the user message.

    Returns:
        The result of the Mistral tokenizer's ``apply_chat_template`` for a
        single user message.
    """
    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)

    # Iterate the assets directly rather than indexing by position.
    audios = [
        Audio.from_file(str(asset.get_local_path()), strict=False)
        for asset in audio_assets
    ]
    audio_chunks = [
        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
    ]

    text_chunk = TextChunk(text=question)
    messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]

    return tokenizer.apply_chat_template(messages=messages)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(
    vllm_runner,
    audio_assets: AudioTestAssets,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Run the shared multi-audio test on ``MODEL_NAME`` in mistral mode.

    A single (prompt, audios) pair is built: the prompt comes from
    ``_get_prompt`` with ``MULTI_AUDIO_PROMPT`` and the audios are each
    asset's ``audio_and_sample_rate``.
    """
    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    audio_inputs = [asset.audio_and_sample_rate for asset in audio_assets]
    prompts_and_audios = [(vllm_prompt, audio_inputs)]

    run_multi_audio_test(
        vllm_runner,
        prompts_and_audios,
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tokenizer_mode="mistral",
    )
@pytest.mark.asyncio
async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Send one chat completion with every audio asset attached.

    Each asset is loaded, tagged with the "wav" format and converted to an
    OpenAI-style audio chunk. The request is capped at 10 tokens, and the
    test asserts that exactly one choice is returned with a
    ``finish_reason`` of ``"length"``.
    """

    def asset_to_chunk(asset):
        # Load the audio from the asset's local path and convert it to the
        # OpenAI dict form.
        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
        audio.format = "wav"
        audio_dict = AudioChunk.from_audio(audio).to_openai()
        return audio_dict

    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
    messages = [{
        "role":
        "user",
        "content": [
            *audio_chunks,
            {
                "type":
                "text",
                "text":
                f"What's happening in these {len(audio_assets)} audio clips?"
            },
        ],
    }]

    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
                                                           messages=messages,
                                                           max_tokens=10)

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
tests/models/multimodal/generation/test_whisper.py
View file @
711aa9d5
...
...
@@ -107,7 +107,7 @@ def run_test(
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
llm
=
vllm_model
.
model
llm
=
vllm_model
.
llm
sampling_params
=
SamplingParams
(
temperature
=
0
,
...
...
tests/models/multimodal/generation/vlm_utils/core.py
View file @
711aa9d5
...
...
@@ -85,7 +85,7 @@ def run_test(
enforce_eager
=
enforce_eager
,
task
=
task
,
**
vllm_runner_kwargs_
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
vllm_kwargs
:
dict
[
str
,
Any
]
=
{}
if
get_stop_token_ids
is
not
None
:
...
...
tests/models/multimodal/pooling/test_dse_qwen2_vl.py
View file @
711aa9d5
...
...
@@ -97,7 +97,7 @@ def _run_test(
dtype
=
dtype
,
enforce_eager
=
True
,
max_model_len
=
8192
)
as
vllm_model
:
tokenizer
=
vllm_model
.
model
.
get_tokenizer
()
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
texts
=
[
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad
...
...
tests/models/multimodal/pooling/test_jinavl_reranker.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Union
import
pytest
from
transformers
import
AutoModel
from
vllm.entrypoints.chat_utils
import
ChatCompletionContentPartImageParam
from
vllm.entrypoints.score_utils
import
ScoreMultiModalParam
from
....conftest
import
HfRunner
,
VllmRunner
# Reranker checkpoint exercised by every test in this module.
model_name = "jinaai/jina-reranker-m0"

# Passed to the vLLM runner as ``mm_processor_kwargs``.
mm_processor_kwargs = dict(min_pixels=3136, max_pixels=602112)

# Passed to the vLLM runner as ``limit_mm_per_prompt``.
limit_mm_per_prompt = dict(image=2)
def vllm_reranker(
    vllm_runner: type[VllmRunner],
    model_name: str,
    dtype: str,
    query_strs: list[str],
    document_strs: list[str],
    query_type: str = "text",
    doc_type: str = "text",
):
    """Score ``query_strs`` against ``document_strs`` with the vLLM runner.

    Args:
        vllm_runner: Runner class used as a context manager.
        model_name: Model to load with ``task="score"``.
        dtype: dtype forwarded to the runner.
        query_strs: Query texts, or image URLs when ``query_type="image"``.
        document_strs: Document texts, or image URLs when
            ``doc_type="image"``.
        query_type: Either ``"text"`` or ``"image"``.
        doc_type: Either ``"text"`` or ``"image"``.

    Returns:
        A list with ``output.outputs.score`` for each output of
        ``vllm_model.llm.score``.

    Raises:
        ValueError: If ``query_type`` or ``doc_type`` is neither ``"text"``
            nor ``"image"``. Without this check the input variable would
            stay unbound and fail later with an UnboundLocalError.
    """

    def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
        return {"type": "image_url", "image_url": {"url": f"{url}"}}

    def build_input(
        strs: list[str],
        kind: str,
        arg_name: str,
    ) -> Union[list[str], ScoreMultiModalParam]:
        # Text inputs are passed through unchanged; image inputs are wrapped
        # as image_url content parts.
        if kind == "text":
            return strs
        if kind == "image":
            return ScoreMultiModalParam(
                content=[create_image_param(url) for url in strs])
        raise ValueError(
            f"Unsupported {arg_name}: {kind!r} (expected 'text' or 'image')")

    query = build_input(query_strs, query_type, "query_type")
    documents = build_input(document_strs, doc_type, "doc_type")

    with vllm_runner(
            model_name,
            task="score",
            dtype=dtype,
            max_num_seqs=2,
            max_model_len=2048,
            mm_processor_kwargs=mm_processor_kwargs,
            limit_mm_per_prompt=limit_mm_per_prompt,
    ) as vllm_model:
        outputs = vllm_model.llm.score(query, documents)

    return [output.outputs.score for output in outputs]
def hf_reranker(
    hf_runner: type[HfRunner],
    model_name: str,
    dtype: str,
    query_strs: list[str],
    document_strs: list[str],
    query_type: str = "text",
    doc_type: str = "text",
):
    """Score the first query against every document with the HF model.

    Only ``query_strs[0]`` is used: it is paired with each entry of
    ``document_strs`` and the pairs are handed to the model's
    ``compute_score`` with ``max_length=2048``.

    Returns:
        The value returned by ``hf_model.model.compute_score``.
    """
    # Prefix remapping supplied to the HF loader through ``key_mapping``.
    key_mapping = {
        "visual.": "model.visual.",
        "model.": "model.language_model.",
    }

    pairs = [[query_strs[0], document] for document in document_strs]

    runner_ctx = hf_runner(
        model_name,
        dtype=dtype,
        trust_remote_code=True,
        auto_cls=AutoModel,
        model_kwargs={"key_mapping": key_mapping},
    )
    with runner_ctx as hf_model:
        scores = hf_model.model.compute_score(
            pairs,
            max_length=2048,
            query_type=query_type,
            doc_type=doc_type,
        )
        return scores
# Visual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
    """Text query vs. image documents: HF and vLLM scores agree within 2%."""
    query = ["slm markdown"]
    documents = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    ]
    modalities = ("text", "image")

    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
                             *modalities)
    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
                                 documents, *modalities)

    # Compare the scores of both documents.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
# Textual Documents Reranking
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
    """Text query vs. text documents: HF and vLLM scores agree within 2%."""
    query = ["slm markdown"]
    abstract = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements."""  # noqa: E501
    documents = [
        abstract,
        "数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
    ]
    modalities = ("text", "text")

    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
                             *modalities)
    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
                                 documents, *modalities)

    # Compare the scores of both documents.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
# Image Querying for Textual Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
    """Image query vs. text documents: HF and vLLM scores agree within 2%."""
    query = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
    ]
    abstract = """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
large language models. The models effectiveness results from two key innovations: (1) a three-stage
data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
refining, and critiquing web content extraction; and (2) a unified training framework combining
continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
lower computational requirements."""  # noqa: E501
    documents = [
        abstract,
        "数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
    ]
    modalities = ("image", "text")

    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
                             *modalities)
    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
                                 documents, *modalities)

    # Compare the scores of both documents.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
# Image Querying for Image Documents
@pytest.mark.parametrize("model_name", [model_name])
@pytest.mark.parametrize("dtype", ["half"])
def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
    """Image query vs. image documents: HF and vLLM scores agree within 2%."""
    query = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
    ]
    documents = [
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
    ]
    modalities = ("image", "image")

    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
                             *modalities)
    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
                                 documents, *modalities)

    # Compare the scores of both documents.
    for idx in (0, 1):
        assert hf_outputs[idx] == pytest.approx(vllm_outputs[idx], rel=0.02)
tests/models/multimodal/pooling/test_prithvi_mae.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.utils
import
set_default_torch_num_threads
from
....conftest
import
VllmRunner
def generate_test_mm_data():
    """Return a dict of constant float16 tensors used as multimodal input.

    ``pixel_values`` has shape (6, 512, 512) and ``location_coords`` has
    shape (1, 2); every element of both tensors is 1.0.
    """
    half = torch.float16
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=half)
    location_coords = torch.full((1, 2), 1.0, dtype=half)
    return {
        "pixel_values": pixel_values,
        "location_coords": location_coords,
    }
def _run_test(
    vllm_runner: type[VllmRunner],
    model: str,
) -> None:
    """Encode ten identical multimodal prompts with ``model`` under vLLM.

    The runner is created with ``task="embed"`` and tokenizer
    initialisation skipped, inside ``set_default_torch_num_threads(1)``.
    Nothing is asserted on the output; the test passes when ``encode``
    completes.
    """
    prompt = []
    for _ in range(10):
        prompt.append({
            # This model deals with no text input
            "prompt_token_ids": [1],
            "multi_modal_data": generate_test_mm_data(),
        })

    runner_kwargs = dict(
        task="embed",
        dtype=torch.float16,
        enforce_eager=True,
        skip_tokenizer_init=True,
        # Limit the maximum number of sequences to avoid the
        # test going OOM during the warmup run
        max_num_seqs=32,
    )

    with set_default_torch_num_threads(1), \
            vllm_runner(model, **runner_kwargs) as vllm_model:
        vllm_model.encode(prompt)
# Model IDs the ``model`` parameter of the test below is parametrized over.
MODELS = [
    "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
]
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
def test_models_image(hf_runner, vllm_runner, image_assets,
                      model: str) -> None:
    """Run ``_run_test`` for each entry of ``MODELS``.

    Only ``vllm_runner`` and ``model`` are forwarded; ``hf_runner`` and
    ``image_assets`` are requested but not used in the body.
    """
    _run_test(vllm_runner=vllm_runner, model=model)
tests/models/multimodal/processing/test_common.py
View file @
711aa9d5
...
...
@@ -161,6 +161,7 @@ def _test_processing_correctness(
_ADD_SPECIAL_TOKENS_OVERRIDES
=
{
"mllama"
:
False
,
"ovis"
:
False
,
"paligemma"
:
False
,
"ultravox"
:
False
,
"whisper"
:
False
,
}
...
...
@@ -291,7 +292,8 @@ def _test_processing_correctness_one(
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-VL-01"
),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-D-0924"
),
os
.
path
.
join
(
models_path_prefix
,
"allenai/Molmo-7B-O-0924"
),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/NVLM-D-72B"
),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/NVLM-D-72B"
),
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
),
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
),
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
),
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis2-1B"
),
...
...
@@ -302,7 +304,7 @@ def _test_processing_correctness_one(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Pixtral-12B-2409"
),
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen-VL-Chat"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Omni-3B"
),
...
...
tests/models/multimodal/processing/test_nemotron_vl.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from
collections.abc
import
Mapping
from
typing
import
Optional
import
pytest
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
rescale_image_size
from
vllm.multimodal.processing
import
BaseMultiModalProcessor
from
....conftest
import
ImageTestAssets
from
...utils
import
build_model_context
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """Return the number of patches expected for ``image``.

    The block count comes from ``calculate_internvl_targets`` (called with
    ``use_thumbnail=False``). One extra patch is added when
    ``config.use_thumbnail`` is truthy and the block count exceeds 1.
    ``num_imgs`` is accepted but not used in the computation.
    """
    from vllm.model_executor.models.internvl import (
        calculate_internvl_targets, get_internvl_target_ratios)

    orig_width, orig_height = image.size
    ratios = get_internvl_target_ratios(min_num, max_num)

    targets = calculate_internvl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=ratios,
        image_size=config.force_image_size,
        use_thumbnail=False,
    )
    blocks, _, _ = targets

    if config.use_thumbnail and blocks > 1:
        return blocks + 1
    return blocks
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Check the processor's image-token and patch counts for ``images``.

    The expected patch total is the sum of ``_get_expected_num_patches``
    over all images. After ``processor.apply`` the prompt must contain
    ``256 * total`` image tokens, and the first dimension of
    ``pixel_values_flat`` must equal ``total``.
    """
    info = processor.info
    tokenizer = info.get_tokenizer()
    config = info.get_hf_config()
    image_processor = info.get_image_processor()

    # Copy the image processor's thumbnail setting onto the config, which is
    # what the expected-patch helper reads.
    config.use_thumbnail = image_processor.use_thumbnail

    num_images = len(images)
    prompt = "<image>" * num_images
    mm_data = {"image": images}

    total_expected_num_patches = 0
    for image in images:
        total_expected_num_patches += _get_expected_num_patches(
            config, image, num_images, min_num, max_num)
    print(total_expected_num_patches)

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # Count the image placeholder tokens and read the flattened pixel shape.
    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    prompt_token_ids = processed_inputs["prompt_token_ids"]
    img_tok_count = prompt_token_ids.count(image_token_id)
    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
    print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)

    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id",
                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: ImageTestAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
    kwargs_on_init: bool,
):
    """Check processor kwargs are honoured at init time or at call time.

    When ``kwargs_on_init`` is true the kwargs go into the model context
    and an empty dict is passed per call; otherwise the context gets
    ``None`` and the kwargs are passed per call. The first image asset is
    rescaled by each factor in ``size_factors`` and verified with
    ``_run_check``.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # With dynamic image sizing off, the expected patch range is fixed at 1.
    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    _run_check(
        processor,
        [
            rescale_image_size(image_assets[0].pil_image, f)
            for f in size_factors
        ],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )
tests/models/multimodal/processing/test_transformers.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.config
import
ModelConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
# yapf: disable
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id):
    """A string prompt and its token-ID form yield the same processed prompt.

    The processor is created for ``model_id`` with
    ``model_impl="transformers"`` and applied twice to the same image: once
    with the text prompt and once with the hard-coded token IDs. The
    ``"prompt"`` entries of both results must be equal.
    """
    model_config = ModelConfig(model=model_id, model_impl="transformers")
    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)

    mm_data = {"image": ImageAsset('cherry_blossom').pil_image}

    str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n"  # noqa: E501
    str_processed_inputs = mm_processor.apply(
        prompt=str_prompt,
        mm_data=mm_data,
        hf_processor_mm_kwargs={},
    )

    ids_prompt = [
        151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
        30, 151645, 151644, 77091, 198
    ]
    ids_processed_inputs = mm_processor.apply(
        prompt=ids_prompt,
        mm_data=mm_data,
        hf_processor_mm_kwargs={},
    )

    prompt_from_str = str_processed_inputs["prompt"]
    prompt_from_ids = ids_processed_inputs["prompt"]
    assert prompt_from_str == prompt_from_ids
tests/quantization/test_bitsandbytes.py
→
tests/
models/
quantization/test_bitsandbytes.py
View file @
711aa9d5
...
...
@@ -17,8 +17,8 @@ from ..utils import models_path_prefix
from
vllm.platforms
import
current_platform
from
..
models
.utils
import
c
heck_embeddings_close
from
..utils
import
c
ompare_two_settings
,
create_new_process_for_each_test
from
...utils
import
c
ompare_two_settings
,
multi_gpu_test
from
..utils
import
c
heck_embeddings_close
,
check_logprobs_close
models_4bit_to_test
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
"quantize opt model inflight"
),
...
...
@@ -30,6 +30,10 @@ models_4bit_to_embedding_test = [
(
"intfloat/e5-mistral-7b-instruct"
,
"quantize embedding model inflight"
),
]
models_4bit_to_moe_test
=
[
(
"allenai/OLMoE-1B-7B-0125-Instruct"
,
"quantize moe model inflight"
),
]
models_pre_qaunt_4bit_to_test
=
[
(
os
.
path
.
join
(
models_path_prefix
,
'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'
),
'read pre-quantized 4-bit FP4 model'
),
...
...
@@ -46,7 +50,6 @@ models_pre_quant_8bit_to_test = [
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
.
is_rocm
(),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
create_new_process_for_each_test
()
def
test_load_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -60,7 +63,6 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_pre_qaunt_4bit_to_test
)
@
create_new_process_for_each_test
()
def
test_load_pre_quant_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -72,7 +74,6 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_pre_quant_8bit_to_test
)
@
create_new_process_for_each_test
()
def
test_load_8bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -80,12 +81,11 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name
,
True
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
'Test requires at least 2 GPUs.'
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
.
is_rocm
(),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
create_new_process_for_each_test
(
)
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_load_tp_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -100,12 +100,10 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
vllm_tp_size
=
2
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
'Test requires at least 2 GPUs.'
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
create_new_process_for_each_test
(
)
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_load_pp_4bit_bnb_model
(
model_name
,
description
)
->
None
:
common_args
=
[
"--disable-log-stats"
,
...
...
@@ -126,12 +124,40 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings
(
model_name
,
common_args
,
pp_args
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
                            model_name, description) -> None:
    """Compare vLLM and HF greedy logprobs for a 4-bit bitsandbytes MoE model.

    vLLM runs first with ``quantization='bitsandbytes'``. The HF runner
    then loads the same model with an nf4, double-quantized
    ``BitsAndBytesConfig``. Both generate 32 tokens with 5 logprobs, and
    the results are checked with ``check_logprobs_close``.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    hf_model_kwargs = {"quantization_config": bnb_config}

    with vllm_runner(model_name,
                     quantization='bitsandbytes',
                     enforce_eager=False) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
                                                           max_tokens=32,
                                                           num_logprobs=5)

    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as hf_model:
        transformers_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens=32, num_logprobs=5)

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_embedding_test
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
create_new_process_for_each_test
()
def
test_4bit_bnb_embedding_model
(
model_name
,
description
,
...
...
@@ -150,6 +176,13 @@ def test_4bit_bnb_embedding_model(
example_prompts
=
[
str
(
s
).
strip
()
for
s
in
example_prompts
]
# Inflight 4bit quantization
with
vllm_runner
(
model_name
,
task
=
"embed"
,
dtype
=
dtype
,
gpu_memory_utilization
=
0.5
,
quantization
=
"bitsandbytes"
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
example_prompts
)
hf_model_kwargs
=
dict
(
quantization_config
=
BitsAndBytesConfig
(
load_in_4bit
=
True
))
with
hf_runner
(
...
...
@@ -160,12 +193,6 @@ def test_4bit_bnb_embedding_model(
)
as
hf_model
:
hf_outputs
=
hf_model
.
encode
(
example_prompts
)
with
vllm_runner
(
model_name
,
task
=
"embed"
,
dtype
=
dtype
,
gpu_memory_utilization
=
0.5
,
quantization
=
"bitsandbytes"
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
example_prompts
)
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
...
...
@@ -193,7 +220,8 @@ def validate_generated_texts(hf_runner,
model_name
,
pre_quant
=
False
,
hf_model_kwargs
=
None
,
vllm_tp_size
=
1
):
vllm_tp_size
=
1
,
max_tokens
=
8
):
# NOTE: run vLLM first, as it requires a clean process
# when using distributed inference
...
...
@@ -201,7 +229,8 @@ def validate_generated_texts(hf_runner,
quantization
=
None
if
pre_quant
else
'bitsandbytes'
,
tensor_parallel_size
=
vllm_tp_size
,
enforce_eager
=
False
)
as
llm
:
vllm_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
vllm_outputs
=
llm
.
generate_greedy
(
prompts
,
max_tokens
)
vllm_logs
=
log_generated_texts
(
prompts
,
vllm_outputs
,
"VllmRunner"
)
# Clean up the GPU memory for the next test
...
...
@@ -213,19 +242,17 @@ def validate_generated_texts(hf_runner,
# Run with HF runner
with
hf_runner
(
model_name
,
model_kwargs
=
hf_model_kwargs
)
as
llm
:
hf_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
hf_outputs
=
llm
.
generate_greedy
(
prompts
,
max_tokens
)
hf_logs
=
log_generated_texts
(
prompts
,
hf_outputs
,
"HfRunner"
)
# Clean up the GPU memory for the next test
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
# Compare the generated strings
for
hf_log
,
vllm_log
in
zip
(
hf_logs
,
vllm_logs
):
hf_str
=
hf_log
[
"generated_text"
]
vllm_str
=
vllm_log
[
"generated_text"
]
prompt
=
hf_log
[
"prompt"
]
assert
hf_str
==
vllm_str
,
(
f
"Model:
{
model_name
}
"
f
"Mismatch between HF and vLLM outputs:
\n
"
f
"Prompt:
{
prompt
}
\n
"
...
...
tests/models/quantization/test_modelopt.py
View file @
711aa9d5
...
...
@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
def
test_models
(
example_prompts
,
model_name
)
->
None
:
model
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
MAX_MODEL_LEN
,
trust_remote_code
=
True
,
...
...
@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for
prompt
in
formatted_prompts
:
outputs
=
model
.
generate
(
prompt
,
params
)
outputs
=
llm
.
generate
(
prompt
,
params
)
generations
.
append
(
outputs
[
0
].
outputs
[
0
].
text
)
del
model
del
llm
print
(
model_name
,
generations
)
expected_strs
=
EXPECTED_STRS_MAP
[
model_name
]
...
...
tests/models/quantization/test_nvfp4.py
View file @
711aa9d5
...
...
@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason
=
"modelopt_fp4 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
def
test_models
(
example_prompts
,
model_name
)
->
None
:
model
=
LLM
(
llm
=
LLM
(
model
=
model_name
,
max_model_len
=
MAX_MODEL_LEN
,
trust_remote_code
=
True
,
...
...
@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for
prompt
in
formatted_prompts
:
outputs
=
model
.
generate
(
prompt
,
params
)
outputs
=
llm
.
generate
(
prompt
,
params
)
generations
.
append
(
outputs
[
0
].
outputs
[
0
].
text
)
del
model
del
llm
print
(
model_name
,
generations
)
expected_strs
=
EXPECTED_STRS_MAP
[
model_name
]
...
...
tests/models/registry.py
View file @
711aa9d5
...
...
@@ -139,16 +139,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"AquilaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/AquilaChat2-7B"
),
trust_remote_code
=
True
),
"ArceeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"arcee-ai/AFM-4.5B-Base"
),
is_available_online
=
False
),
"ArcticForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-instruct"
),
trust_remote_code
=
True
),
"BaiChuanForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan-7B"
),
trust_remote_code
=
True
),
"BaichuanForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"baichuan-inc/Baichuan2-7B-chat"
),
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
),
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"hmellor/tiny-random-BambaForCausalLM"
)}),
# noqa: E501
"BloomForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloom-560m"
),
{
"1b"
:
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloomz-1b1"
)}),
"BailingMoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"inclusionAI/Ling-lite-1.5"
),
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/Bamba-9B"
),
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"hmellor/tiny-random-BambaForCausalLM"
)}),
# noqa: E501
"BloomForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloom-560m"
),
{
"1b"
:
os
.
path
.
join
(
models_path_prefix
,
"bigscience/bloomz-1b1"
)}),
"ChatGLMModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/chatglm3-6b"
),
trust_remote_code
=
True
,
max_transformers_version
=
"4.48"
),
...
...
@@ -166,14 +170,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"DeepseekV3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V3"
),
# noqa: E501
trust_remote_code
=
True
),
"Ernie4_5_ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"baidu/ERNIE-4.5-0.3B-PT"
),
trust_remote_code
=
True
),
"Ernie4_5_MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"baidu/ERNIE-4.5-21B-A3B-PT"
),
trust_remote_code
=
True
),
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
)),
# noqa: E501
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mgleize/fairseq2-dummy-Llama-3.2-1B"
)),
# noqa: E501
"FalconForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
)),
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/Falcon-H1-0.5B-Base"
),
"Ernie4_5_ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"baidu/ERNIE-4.5-0.3B-PT"
),
min_transformers_version
=
"4.54"
),
"Ernie4_5_MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"baidu/ERNIE-4.5-21B-A3B-PT"
),
min_transformers_version
=
"4.54"
),
"ExaoneForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
)),
# noqa: E501
"Exaone4ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-4.0-32B"
)),
# noqa: E501
"Fairseq2LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"mgleize/fairseq2-dummy-Llama-3.2-1B"
)),
# noqa: E501
"FalconForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
)),
"FalconH1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/Falcon-H1-0.5B-Base"
),
min_transformers_version
=
"4.53"
),
"GemmaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-1.1-2b-it"
)),
"Gemma2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-9b"
)),
...
...
@@ -198,7 +203,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"HunYuanMoEV1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tencent/Hunyuan-A13B-Instruct"
),
trust_remote_code
=
True
),
"InternLMForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm-chat-7b"
),
"HunYuanDenseV1ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tencent/Hunyuan-7B-Instruct-0124"
),
trust_remote_code
=
True
),
"InternLMForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm-chat-7b"
),
trust_remote_code
=
True
),
"InternLM2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-chat-7b"
),
trust_remote_code
=
True
),
...
...
@@ -222,6 +229,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"MiniCPM3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
),
trust_remote_code
=
True
),
"MiniMaxForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-Text-01-hf"
),
min_transformers_version
=
"4.53"
),
"MiniMaxText01ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"MiniMaxAI/MiniMax-Text-01"
),
trust_remote_code
=
True
,
revision
=
"a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"
),
# noqa: E501
...
...
@@ -243,14 +252,14 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
{
"1b"
:
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-iml-max-1.3b"
)}),
"OrionForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"OrionStarAI/Orion-14B-Chat"
),
trust_remote_code
=
True
),
"PersimmonForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"adept/persimmon-8b-chat"
)),
"PhiForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
)),
"Phi3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-mini-4k-instruct"
)),
# Blocksparse attention not supported in V1 yet
"Phi3SmallForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-small-8k-instruct"
)
,
trust_remote_code
=
True
,
v0_only
=
True
),
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
),
"PersimmonForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"adept/persimmon-8b-chat"
)),
"PhiForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/phi-2"
)),
"Phi3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-mini-4k-instruct"
)),
"Phi4FlashForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-4-mini-flash-reasoning"
),
# noqa: E501
trust_remote_code
=
True
,
v0_only
=
True
,
max_model_len
=
10240
),
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
),
trust_remote_code
=
True
),
"Plamo2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"pfnet/plamo-2-1b"
),
trust_remote_code
=
True
),
...
...
@@ -258,16 +267,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-0.5B-Instruct"
),
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-0.5B-Instruct"
}),
# noqa: E501
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
)),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen3-8B"
)),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen3-30B-A3B"
)),
"Qwen3ForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
)),
# noqa: E501
"RWForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-40b"
)),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-zephyr-3b"
)),
# noqa: E501
"StableLmForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
)),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)),
"SolarForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"upstage/solar-pro-preview-instruct"
)),
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Tele-AI/TeleChat2-3B"
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
)),
"Qwen3ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen3-8B"
)),
"Qwen3MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen3-30B-A3B"
)),
"RWForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-40b"
)),
"StableLMEpochForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-zephyr-3b"
)),
# noqa: E501
"StableLmForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"stabilityai/stablelm-3b-4e1t"
)),
"Starcoder2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"bigcode/starcoder2-3b"
)),
"SolarForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"upstage/solar-pro-preview-instruct"
)),
"TeleChat2ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Tele-AI/TeleChat2-3B"
),
trust_remote_code
=
True
),
"TeleFLMForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"CofeAI/FLM-2-52B-Instruct-2407"
),
trust_remote_code
=
True
),
...
...
@@ -290,28 +298,27 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
_EMBEDDING_EXAMPLE_MODELS
=
{
# [Text-only]
"BertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-base-en-v1.5"
),
v0_only
=
True
),
"Gemma2Model"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
),
v0_only
=
True
),
# noqa: E501
"GPT2ForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nie3e/sentiment-polish-gpt2-small"
)),
# noqa: E501
"GritLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"parasail-ai/GritLM-7B-vllm"
)),
"GteModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-embed-m-v2.0"
),
"BertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-base-en-v1.5"
),
v0_only
=
True
),
"Gemma2Model"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
),
v0_only
=
True
),
# noqa: E501
"GritLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"parasail-ai/GritLM-7B-vllm"
)),
"GteModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Snowflake/snowflake-arctic-embed-m-v2.0"
),
trust_remote_code
=
True
),
"GteNewModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-base-en-v1.5"
),
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GteNewModel"
]}),
# noqa: E501
"InternLM2ForRewardModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-1_8b-reward"
),
trust_remote_code
=
True
),
"JambaForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-reward-dev"
)),
# noqa: E501
"LlamaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llama"
),
is_available_online
=
False
),
"MistralModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)),
"ModernBertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-modernbert-base"
),
"JambaForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-reward-dev"
)),
# noqa: E501
"LlamaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llama"
),
is_available_online
=
False
),
"MistralModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
)),
"ModernBertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-modernbert-base"
),
trust_remote_code
=
True
,
v0_only
=
True
),
"NomicBertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nomic-ai/nomic-embed-text-v2-moe"
),
"NomicBertModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nomic-ai/nomic-embed-text-v2-moe"
),
trust_remote_code
=
True
,
v0_only
=
True
),
# noqa: E501
"Qwen2Model"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ssmits/Qwen2-7B-Instruct-embed-base"
)),
"Qwen2ForRewardModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
)),
"Qwen2ForProcessRewardModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-PRM-7B"
)),
"Qwen2ForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"jason9693/Qwen2.5-1.5B-apeach"
)),
# noqa: E501
"RobertaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/stsb-roberta-base-v2"
),
v0_only
=
True
),
# noqa: E501
"RobertaForMaskedLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-roberta-large-v1"
),
v0_only
=
True
),
# noqa: E501
"XLMRobertaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-small"
),
v0_only
=
True
),
# noqa: E501
...
...
@@ -324,12 +331,27 @@ _EMBEDDING_EXAMPLE_MODELS = {
is_available_online
=
False
),
# noqa: E501
}
_CROSS_ENCODER_EXAMPLE_MODELS
=
{
# [Text-only]
_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS
=
{
# [Decoder-only]
"GPT2ForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nie3e/sentiment-polish-gpt2-small"
)),
# noqa: E501
# [Cross-encoder]
"BertForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"cross-encoder/ms-marco-MiniLM-L-6-v2"
),
v0_only
=
True
),
# noqa: E501
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-reranker-modernbert-base"
),
v0_only
=
True
),
# noqa: E501
"RobertaForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"cross-encoder/quora-roberta-base"
),
v0_only
=
True
),
# noqa: E501
"XLMRobertaForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-reranker-v2-m3"
),
v0_only
=
True
),
# noqa: E501
"ModernBertForSequenceClassification"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Alibaba-NLP/gte-reranker-modernbert-base"
),
v0_only
=
True
),
# noqa: E501
}
# Example checkpoints for sequence-classification architectures, keyed by
# architecture name. Each value is an ``_HfExamplesInfo`` whose first argument
# is the checkpoint path resolved under ``models_path_prefix``.
_AUTOMATIC_CONVERTED_MODELS = {
    # Use as_seq_cls_model for automatic conversion
    "GemmaForSequenceClassification": _HfExamplesInfo(
        # The closing parenthesis of os.path.join() must sit outside the
        # string literal; otherwise the path ends in ")" and the keyword
        # arguments below are passed to os.path.join instead of
        # _HfExamplesInfo.
        os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-gemma"),
        v0_only=True,
        hf_overrides={
            "architectures": ["GemmaForSequenceClassification"],
            "classifier_from_token": ["Yes"],
            "method": "no_post_processing",
        },
    ),
    "LlamaForSequenceClassification": _HfExamplesInfo(
        os.path.join(models_path_prefix,
                     "Skywork/Skywork-Reward-V2-Llama-3.2-1B"),
    ),
    "Qwen2ForSequenceClassification": _HfExamplesInfo(
        os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach"),
    ),
    "Qwen3ForSequenceClassification": _HfExamplesInfo(
        os.path.join(models_path_prefix,
                     "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"),
    ),
}
_MULTIMODAL_EXAMPLE_MODELS
=
{
...
...
@@ -350,12 +372,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"GLM4VForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/glm-4v-9b"
),
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
"Glm4vForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.1V-9B-Thinking"
),
min_transformers_version
=
"4.53"
),
# noqa: E501
"Glm4MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.5"
),
"Glm4vForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.1V-9B-Thinking"
),
min_transformers_version
=
"4.53"
),
# noqa: E501
"Glm4MoeForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.5"
),
min_transformers_version
=
"4.54"
,
is_available_online
=
False
),
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-800m"
),
extras
=
{
"2b"
:
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-2b"
)},
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-800m"
),
extras
=
{
"2b"
:
os
.
path
.
join
(
models_path_prefix
,
"h2oai/h2ovl-mississippi-2b"
)},
# noqa: E501
max_transformers_version
=
"4.48"
,
# noqa: E501
transformers_version_reason
=
"HF model is not compatible."
),
# noqa: E501
"InternVLChatModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL2-1B"
),
...
...
@@ -364,12 +386,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
),
# noqa: E501
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM-256M-Instruct"
)}),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
)
,
# noqa: E501
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Instruct"
),
# noqa: E501
extras
=
{
"thinking"
:
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Thinking"
)},
# noqa: E501
trust_remote_code
=
True
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
# noqa: E501
max_model_len
=
10240
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
),
extras
=
{
"mistral"
:
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
# noqa: E501
...
...
@@ -398,9 +420,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"NVLM_D"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/NVLM-D-72B"
),
trust_remote_code
=
True
),
"Llama_Nemotron_Nano_VL"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
),
# noqa: E501
trust_remote_code
=
True
),
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"google/paligemma-3b-mix-224"
),
# noqa: E501
extras
=
{
"v2"
:
os
.
path
.
join
(
models_path_prefix
,
"google/paligemma2-3b-ft-docci-448"
)}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
)
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3-vision-128k-instruct"
,
trust_remote_code
=
True
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
...
...
@@ -418,7 +443,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]}),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-Audio-7B-Instruct"
)),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
)),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
),
# noqa: E501
max_model_len
=
4096
),
"Qwen2_5OmniModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Omni-3B"
)),
"Qwen2_5OmniForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Omni-7B-AWQ"
)),
# noqa: E501
"SkyworkR1VChatModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Skywork/Skywork-R1V-38B"
)),
...
...
@@ -429,6 +455,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides
=
{
"architectures"
:
[
"TarsierForConditionalGeneration"
]}),
# noqa: E501
"Tarsier2ForConditionalGeneration"
:
_HfExamplesInfo
(
"omni-research/Tarsier2-Recap-7b"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"Tarsier2ForConditionalGeneration"
]}),
# noqa: E501
"VoxtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Voxtral-Mini-3B-2507"
,
min_transformers_version
=
"4.54"
,
# disable this temporarily until we support HF format
is_available_online
=
False
,
),
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
...
...
@@ -436,17 +468,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"Isotr0py/Florence-2-tokenizer"
),
# noqa: E501
trust_remote_code
=
True
),
# noqa: E501
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision-Instruct"
)),
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)),
# noqa: E501
"WhisperForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-large-v3"
)),
# noqa: E501
# [Cross-encoder]
"JinaVLForRanking"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"jinaai/jina-reranker-m0"
)),
# noqa: E501
}
_SPECULATIVE_DECODING_EXAMPLE_MODELS
=
{
"EAGLEModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"abhigoyal/vllm-eagle-llama-68m-random"
)),
# noqa: E501
"MedusaModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"abhigoyal/vllm-medusa-llama-68m-random"
)),
# noqa: E501
"MLPSpeculatorPreTrainedModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-160m"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"ibm-ai-platform/llama-160m-accelerator"
)),
# noqa: E501
# Temporarily disabled.
# TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
# speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_main_random"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"luccafong/deepseek_mtp_draft_random"
),
# noqa: E501
trust_remote_code
=
True
),
...
...
@@ -454,32 +488,39 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3-Instruct-8B"
),
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Meta-Llama-3-8B-Instruct"
)),
# noqa: E501
"Eagle3LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
),
# noqa: E501
"Eagle3LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
)
)
,
# noqa: E501
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
),
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)),
"EagleMiniCPMForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-1B-sft-bf16"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
),
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)),
"EagleLlama4ForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
),
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
),
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)),
# noqa: E501
"EagleMiniCPMForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-1B-sft-bf16"
),
trust_remote_code
=
True
,
is_available_online
=
False
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
),
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
)),
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.5"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.5"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
),
tokenizer
=
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
)),
"Glm4MoeMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.5"
),
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"THUDM/GLM-4.5"
),
min_transformers_version
=
"4.54"
,
is_available_online
=
False
),
"MiMoMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
),
"MiMoMTPModel"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
),
trust_remote_code
=
True
,
speculative_model
=
os
.
path
.
join
(
models_path_prefix
,
"XiaomiMiMo/MiMo-7B-RL"
))
}
_TRANSFORMERS_MODELS
=
{
"TransformersForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ArthurZ/Ilama-3.2-1B"
),
trust_remote_code
=
True
),
# noqa: E501
"TransformersForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"hmellor/Ilama-3.2-1B"
),
trust_remote_code
=
True
),
# noqa: E501
"TransformersForMultimodalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"OpenGVLab/InternVL3-1B-hf"
)),
}
_EXAMPLE_MODELS
=
{
**
_TEXT_GENERATION_EXAMPLE_MODELS
,
**
_EMBEDDING_EXAMPLE_MODELS
,
**
_
CROSS_ENCODER
_EXAMPLE_MODELS
,
**
_
SEQUENCE_CLASSIFICATION
_EXAMPLE_MODELS
,
**
_MULTIMODAL_EXAMPLE_MODELS
,
**
_SPECULATIVE_DECODING_EXAMPLE_MODELS
,
**
_TRANSFORMERS_MODELS
,
...
...
@@ -511,4 +552,5 @@ class HfExampleModels:
raise
ValueError
(
f
"No example model defined for
{
model_id
}
"
)
HF_EXAMPLE_MODELS
=
HfExampleModels
(
_EXAMPLE_MODELS
)
\ No newline at end of file
HF_EXAMPLE_MODELS
=
HfExampleModels
(
_EXAMPLE_MODELS
)
AUTO_EXAMPLE_MODELS
=
HfExampleModels
(
_AUTOMATIC_CONVERTED_MODELS
)
tests/models/test_initialization.py
View file @
711aa9d5
...
...
@@ -12,20 +12,36 @@ from vllm.utils import GiB_bytes
from
vllm.v1.core.kv_cache_utils
import
get_kv_cache_config
from
vllm.v1.engine.core
import
EngineCore
as
V1EngineCore
from
.registry
import
HF_EXAMPLE_MODELS
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
HF_EXAMPLE_MODELS
.
get_supported_archs
())
def
test_can_initialize
(
model_arch
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
):
model_info
=
HF_EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
from
..utils
import
create_new_process_for_each_test
from
.registry
import
AUTO_EXAMPLE_MODELS
,
HF_EXAMPLE_MODELS
,
HfExampleModels
@
create_new_process_for_each_test
()
def
can_initialize
(
model_arch
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
,
EXAMPLE_MODELS
:
HfExampleModels
):
"""The reason for using create_new_process_for_each_test is to avoid
the WARNING:
"We must use the 'spawn' multiprocessing start method. Overriding
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'."
The spawn process causes the _initialize_kv_caches_v1 function below to
become ineffective.
"""
model_info
=
EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
# FIXME: Possible memory leak in the previous tests?
if
model_arch
in
(
"GraniteSpeechForConditionalGeneration"
,
if
model_arch
in
(
"Glm4vForConditionalGeneration"
,
"GraniteSpeechForConditionalGeneration"
,
"KimiVLForConditionalGeneration"
):
pytest
.
skip
(
"Avoid OOM"
)
if
model_arch
in
(
"Llama4ForCausalLM"
,
"EagleLlama4ForCausalLM"
):
from
vllm.model_executor.models.llama4
import
Llama4ForCausalLM
from
vllm.model_executor.models.registry
import
ModelRegistry
ModelRegistry
.
register_model
(
"Llama4ForCausalLM"
,
Llama4ForCausalLM
)
# Avoid OOM and reduce initialization time by only using 1 layer
def
hf_overrides
(
hf_config
:
PretrainedConfig
)
->
PretrainedConfig
:
hf_config
.
update
(
model_info
.
hf_overrides
)
...
...
@@ -33,13 +49,18 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config
=
hf_config
.
get_text_config
()
# Ensure at least 2 expert per group
# Since `grouped_topk` assums top-2
# Since `grouped_topk` assum
e
s top-2
n_group
=
getattr
(
text_config
,
'n_group'
,
None
)
num_experts
=
n_group
*
2
if
n_group
is
not
None
else
2
# we use three layers for Gemma-3n to check
# both normal layer and kv_shared_layer
num_hidden_layers
=
(
3
if
model_arch
==
"Gemma3nForConditionalGeneration"
else
1
)
text_config
.
update
({
"num_layers"
:
1
,
"num_hidden_layers"
:
1
,
"num_hidden_layers"
:
num_hidden_layers
,
"num_experts"
:
num_experts
,
"num_experts_per_tok"
:
2
,
"num_local_experts"
:
num_experts
,
...
...
@@ -47,6 +68,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
"first_k_dense_replace"
:
0
,
# To avoid OOM on DeepSeek-V3
"n_routed_experts"
:
num_experts
,
# For Gemma-3n
"num_kv_shared_layers"
:
1
,
})
if
hasattr
(
hf_config
,
"vision_config"
):
...
...
@@ -86,6 +109,9 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
_initialize_kv_caches_v1
),
monkeypatch
.
context
()
as
m
):
if
model_info
.
v0_only
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
if
model_arch
==
"Phi4FlashForCausalLM"
:
# Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"DIFFERENTIAL_FLASH_ATTN"
)
LLM
(
model_info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
...
...
@@ -102,3 +128,15 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
load_format
=
"dummy"
,
hf_overrides
=
hf_overrides
,
)
@pytest.mark.parametrize(
    "model_arch",
    HF_EXAMPLE_MODELS.get_supported_archs(),
)
def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
    """Run ``can_initialize`` for one architecture of ``HF_EXAMPLE_MODELS``.

    Parametrized over every architecture name returned by
    ``HF_EXAMPLE_MODELS.get_supported_archs()``.
    """
    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
@pytest.mark.parametrize(
    "model_arch",
    AUTO_EXAMPLE_MODELS.get_supported_archs(),
)
def test_implicit_converted_models(model_arch: str,
                                   monkeypatch: pytest.MonkeyPatch):
    """Run ``can_initialize`` for one architecture of ``AUTO_EXAMPLE_MODELS``.

    Parametrized over every architecture name returned by
    ``AUTO_EXAMPLE_MODELS.get_supported_archs()``.
    """
    can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)
tests/models/test_registry.py
View file @
711aa9d5
...
...
@@ -80,11 +80,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"model_arch,is_pp,init_cuda"
,
[
(
os
.
path
.
join
(
models_path_prefix
,
"MLPSpeculatorPreTrainedModel"
),
False
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"DeepseekV2ForCausalLM"
),
True
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen2VLForConditionalGeneration"
),
True
,
True
),
])
@
pytest
.
mark
.
parametrize
(
"model_arch,is_pp,init_cuda"
,
[
# TODO(woosuk): Re-enable this once the MLP Speculator is supported
# in V1.
# ("MLPSpeculatorPreTrainedModel", False, False),
(
os
.
path
.
join
(
models_path_prefix
,
"DeepseekV2ForCausalLM"
),
True
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen2VLForConditionalGeneration"
),
True
,
True
),
])
def
test_registry_is_pp
(
model_arch
,
is_pp
,
init_cuda
):
assert
ModelRegistry
.
is_pp_supported_model
(
model_arch
)
is
is_pp
...
...
tests/models/test_transformers.py
View file @
711aa9d5
...
...
@@ -57,8 +57,8 @@ def check_implementation(
@
pytest
.
mark
.
parametrize
(
"model,model_impl"
,
[
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
"transformers"
),
(
os
.
path
.
join
(
models_path_prefix
,
"ArthurZ
/Ilama-3.2-1B"
,
"auto"
)
)
,
# CUSTOM CODE
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
"transformers"
),
(
os
.
path
.
join
(
models_path_prefix
,
"hmellor
/Ilama-3.2-1B"
)
,
"auto"
),
# CUSTOM CODE
])
# trust_remote_code=True by default
def
test_models
(
hf_runner
:
type
[
HfRunner
],
...
...
@@ -105,7 +105,7 @@ def test_distributed(
reason
=
"bitsandbytes quantization is currently not supported in rocm."
)
@
pytest
.
mark
.
parametrize
(
"model, quantization_kwargs"
,
[
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
{
"quantization"
:
"bitsandbytes"
,
},
...
...
@@ -139,4 +139,39 @@ def test_quantization(
outputs_1_lst
=
vllm_outputs
,
name_0
=
"transformers"
,
name_1
=
"vllm"
,
)
\ No newline at end of file
)
@pytest.mark.parametrize(
    "model",
    [os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")],
)
@pytest.mark.parametrize("dtype", ["float"])
def test_classify(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    monkeypatch,
) -> None:
    """Compare classification outputs of vLLM and Hugging Face.

    vLLM is run with ``model_impl="transformers"``; the reference is loaded
    through ``AutoModelForSequenceClassification``. Each pair of outputs must
    agree within a relative tolerance chosen from ``dtype``.
    """
    import torch
    from transformers import AutoModelForSequenceClassification

    # Third positional argument of torch.allclose is the relative tolerance.
    rtol = 1e-3 if dtype == "float" else 1e-2

    vllm_kwargs = dict(max_model_len=512,
                       dtype=dtype,
                       model_impl="transformers")
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        vllm_outputs = vllm_model.classify(example_prompts)

    hf_kwargs = dict(dtype=dtype, auto_cls=AutoModelForSequenceClassification)
    with hf_runner(model, **hf_kwargs) as hf_model:
        hf_outputs = hf_model.classify(example_prompts)

    for hf_raw, vllm_raw in zip(hf_outputs, vllm_outputs):
        hf_scores = torch.tensor(hf_raw)
        vllm_scores = torch.tensor(vllm_raw)

        assert torch.allclose(hf_scores, vllm_scores, rtol)
Prev
1
…
14
15
16
17
18
19
20
21
22
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment