Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da93439
Commit
0da93439
authored
Mar 26, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori
parents
25f2f756
298e5108
Changes
613
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
751 additions
and
133 deletions
+751
-133
tests/kernels/quantization/test_mxfp4_triton_ep.py
tests/kernels/quantization/test_mxfp4_triton_ep.py
+0
-83
tests/kernels/quantization/test_rocm_skinny_gemms.py
tests/kernels/quantization/test_rocm_skinny_gemms.py
+5
-4
tests/lora/conftest.py
tests/lora/conftest.py
+5
-0
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+189
-0
tests/lora/test_lora_utils.py
tests/lora/test_lora_utils.py
+60
-0
tests/lora/test_qwen35_densemoel_lora.py
tests/lora/test_qwen35_densemoel_lora.py
+132
-0
tests/model_executor/layers/test_rocm_unquantized_gemm.py
tests/model_executor/layers/test_rocm_unquantized_gemm.py
+89
-0
tests/models/language/pooling/test_colbert.py
tests/models/language/pooling/test_colbert.py
+16
-0
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_common.py
+4
-1
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+5
-1
tests/models/multimodal/generation/test_keye.py
tests/models/multimodal/generation/test_keye.py
+1
-5
tests/models/multimodal/generation/test_nemotron_parse.py
tests/models/multimodal/generation/test_nemotron_parse.py
+44
-11
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/builders.py
+1
-4
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+27
-19
tests/models/multimodal/pooling/test_colpali.py
tests/models/multimodal/pooling/test_colpali.py
+1
-1
tests/models/multimodal/pooling/test_colqwen3.py
tests/models/multimodal/pooling/test_colqwen3.py
+1
-1
tests/models/multimodal/pooling/test_colqwen3_5.py
tests/models/multimodal/pooling/test_colqwen3_5.py
+154
-0
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
+9
-2
tests/models/multimodal/pooling/test_phi3v.py
tests/models/multimodal/pooling/test_phi3v.py
+7
-0
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
613 of 613+
files are displayed.
Plain diff
Email patch
tests/kernels/quantization/test_mxfp4_triton_ep.py
View file @
0da93439
...
...
@@ -17,89 +17,6 @@ from unittest.mock import MagicMock, patch
import
pytest
import
torch
from
vllm.model_executor.layers.quantization.mxfp4
import
(
Mxfp4Backend
,
Mxfp4MoEMethod
,
)
def
_make_mock_moe_config
(
ep_size
:
int
=
1
)
->
MagicMock
:
"""Create a mock FusedMoEConfig with the given EP size."""
parallel_config
=
MagicMock
()
parallel_config
.
ep_size
=
ep_size
moe_config
=
MagicMock
()
moe_config
.
ep_size
=
ep_size
moe_config
.
is_lora_enabled
=
False
moe_config
.
moe_parallel_config
=
parallel_config
return
moe_config
class
TestMxfp4TritonIsMonolithic
:
"""Verify that is_monolithic is always True for the TRITON backend,
regardless of EP size, since triton_kernel_moe_forward now handles
expert_map remapping internally."""
@
pytest
.
mark
.
parametrize
(
"backend,ep_size,expected_monolithic"
,
[
# TRITON is always monolithic (handles EP via expert_map remapping)
(
Mxfp4Backend
.
TRITON
,
1
,
True
),
(
Mxfp4Backend
.
TRITON
,
2
,
True
),
(
Mxfp4Backend
.
TRITON
,
4
,
True
),
# SM100 backends are always monolithic
(
Mxfp4Backend
.
SM100_FI_MXFP4_MXFP8_TRTLLM
,
1
,
True
),
(
Mxfp4Backend
.
SM100_FI_MXFP4_MXFP8_TRTLLM
,
2
,
True
),
(
Mxfp4Backend
.
SM100_FI_MXFP4_BF16
,
1
,
True
),
(
Mxfp4Backend
.
SM100_FI_MXFP4_BF16
,
2
,
True
),
# MARLIN is never monolithic
(
Mxfp4Backend
.
MARLIN
,
1
,
False
),
(
Mxfp4Backend
.
MARLIN
,
2
,
False
),
],
ids
=
[
"triton-no-ep"
,
"triton-ep2"
,
"triton-ep4"
,
"sm100-trtllm-no-ep"
,
"sm100-trtllm-ep2"
,
"sm100-bf16-no-ep"
,
"sm100-bf16-ep2"
,
"marlin-no-ep"
,
"marlin-ep2"
,
],
)
@
patch
(
"vllm.model_executor.layers.quantization.mxfp4.get_mxfp4_backend"
,
)
@
patch
(
"vllm.model_executor.layers.quantization.mxfp4.get_current_vllm_config"
,
)
def
test_is_monolithic
(
self
,
mock_get_config
,
mock_get_backend
,
backend
,
ep_size
,
expected_monolithic
,
):
"""is_monolithic should be True for TRITON regardless of EP size."""
mock_get_backend
.
return_value
=
backend
mock_compilation_config
=
MagicMock
()
mock_compilation_config
.
max_cudagraph_capture_size
=
1024
mock_vllm_config
=
MagicMock
()
mock_vllm_config
.
compilation_config
=
mock_compilation_config
mock_get_config
.
return_value
=
mock_vllm_config
moe_config
=
_make_mock_moe_config
(
ep_size
=
ep_size
)
method
=
Mxfp4MoEMethod
(
moe_config
)
assert
method
.
is_monolithic
==
expected_monolithic
,
(
f
"Expected is_monolithic=
{
expected_monolithic
}
for "
f
"backend=
{
backend
.
name
}
, ep_size=
{
ep_size
}
, "
f
"but got
{
method
.
is_monolithic
}
."
)
class
TestTritonMoeForwardExpertMap
:
"""Test that triton_kernel_moe_forward applies expert_map remapping
...
...
tests/kernels/quantization/test_rocm_skinny_gemms.py
View file @
0da93439
...
...
@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode
BIAS
=
torch
.
rand
(
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
elif
bias_mode
==
2
:
BIAS
=
torch
.
rand
(
n
,
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
elif
bias_mode
==
3
:
BIAS
=
torch
.
rand
(
1
,
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
ref_out
=
torch
.
nn
.
functional
.
linear
(
A
,
B
,
BIAS
)
out
=
ops
.
wvSplitKrc
(
A
,
B
,
cu_count
,
BIAS
)
...
...
@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel(
ref_out
=
torch
.
nn
.
functional
.
linear
(
A
,
B
,
BIAS
)
out
=
ops
.
wvSplitK
(
B
,
A
.
view
(
-
1
,
A
.
size
(
-
1
)),
cu_count
,
BIAS
)
if
xnorm
:
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
1e-3
,
rtol
=
1e-8
)
else
:
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
1e-3
,
rtol
=
1e-2
)
# Accumulation error in fp16 GEMM scales with sqrt(K)
atol
=
torch
.
finfo
(
dtype
).
eps
*
math
.
sqrt
(
k
)
torch
.
testing
.
assert_close
(
out
,
ref_out
,
atol
=
atol
,
rtol
=
1e-2
)
@
pytest
.
mark
.
parametrize
(
"xnorm"
,
[
False
,
True
])
...
...
tests/lora/conftest.py
View file @
0da93439
...
...
@@ -294,6 +294,11 @@ def whisper_lora_files():
return
snapshot_download
(
repo_id
=
"chengyili2005/whisper-small-mandarin-lora"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen35_dense_model_lora_files
():
return
snapshot_download
(
repo_id
=
"jeeejeee/qwen35-4b-text-only-sql-lora"
)
@
pytest
.
fixture
def
reset_default_device
():
"""
...
...
tests/lora/test_lora_manager.py
View file @
0da93439
...
...
@@ -711,3 +711,192 @@ def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, devic
torch
.
testing
.
assert_close
(
packed_lora1
.
lora_b
[
1
],
model_lora_clone1
.
get_lora
(
"up_proj"
).
lora_b
)
def
_test_target_modules
(
model
,
target_modules
:
list
[
str
]
|
None
,
device
:
str
,
expected_lora
:
list
[
tuple
[
str
,
type
]],
expected_no_lora
:
list
[
tuple
[
str
,
type
]],
):
"""Create a LoRAModelManager and assert which modules have LoRA applied."""
LoRAModelManager
(
model
,
2
,
2
,
2
,
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
2
,
max_loras
=
2
,
lora_dtype
=
DEFAULT_DTYPE
,
target_modules
=
target_modules
,
),
device
=
device
,
)
for
module_path
,
lora_cls
in
expected_lora
:
assert
isinstance
(
model
.
get_submodule
(
module_path
),
lora_cls
)
for
module_path
,
lora_cls
in
expected_no_lora
:
assert
not
isinstance
(
model
.
get_submodule
(
module_path
),
lora_cls
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_target_modules_config
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
"""Test that target_modules config restricts which modules get LoRA applied."""
_test_target_modules
(
dummy_model
,
[
"dense1"
],
device
,
expected_lora
=
[
(
"dense1"
,
ColumnParallelLinearWithLoRA
),
(
"layer1.dense1"
,
ColumnParallelLinearWithLoRA
),
],
expected_no_lora
=
[
(
"dense2"
,
RowParallelLinearWithLoRA
),
(
"layer1.dense2"
,
RowParallelLinearWithLoRA
),
],
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_target_modules_multiple
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
"""Test that multiple target_modules work correctly."""
_test_target_modules
(
dummy_model
,
[
"dense1"
,
"dense2"
],
device
,
expected_lora
=
[
(
"dense1"
,
ColumnParallelLinearWithLoRA
),
(
"layer1.dense1"
,
ColumnParallelLinearWithLoRA
),
(
"dense2"
,
RowParallelLinearWithLoRA
),
(
"layer1.dense2"
,
RowParallelLinearWithLoRA
),
],
expected_no_lora
=
[],
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_target_modules_none_uses_all
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
"""Test that target_modules=None uses all supported modules."""
_test_target_modules
(
dummy_model
,
None
,
device
,
expected_lora
=
[
(
"dense1"
,
ColumnParallelLinearWithLoRA
),
(
"layer1.dense1"
,
ColumnParallelLinearWithLoRA
),
(
"dense2"
,
RowParallelLinearWithLoRA
),
(
"layer1.dense2"
,
RowParallelLinearWithLoRA
),
],
expected_no_lora
=
[],
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_load_adapter_warns_on_unsupported_modules
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
"""Test that _load_adapter warns when a LoRA adapter contains modules
not in the model's supported LoRA target modules."""
from
unittest.mock
import
patch
import
vllm.lora.worker_manager
as
wm_module
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
)
dummy_lora_files
=
f
"
{
tmp_path
}
/lora_adapter"
os
.
makedirs
(
dummy_lora_files
,
exist_ok
=
True
)
create_peft_lora
(
dummy_model_gate_up
,
save_dir
=
dummy_lora_files
,
target_modules
=
[
"layer1.dense1"
,
"dense2"
],
lora_dtype
=
DEFAULT_DTYPE
,
)
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
lora_config
=
lora_config
)
vllm_config
.
scheduler_config
.
max_num_seqs
=
4
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
2
worker_manager
=
WorkerLoRAManager
(
vllm_config
,
device
,
EMBEDDING_MODULES
)
worker_manager
.
vocab_size
=
dummy_model_gate_up
.
unpadded_vocab_size
worker_manager
.
create_lora_manager
(
dummy_model_gate_up
)
# Patch from_local_checkpoint to inject an unsupported module
original_from_checkpoint
=
LoRAModel
.
from_local_checkpoint
def
patched_from_checkpoint
(
*
args
,
**
kwargs
):
lora
=
original_from_checkpoint
(
*
args
,
**
kwargs
)
lora
.
loras
[
"unsupported_module"
]
=
LoRALayerWeights
(
module_name
=
"unsupported_module"
,
rank
=
8
,
lora_alpha
=
16
,
lora_a
=
torch
.
randn
(
8
,
10
),
lora_b
=
torch
.
randn
(
10
,
8
),
)
return
lora
lora_request
=
LoRARequest
(
"test"
,
1
,
dummy_lora_files
)
with
(
patch
.
object
(
LoRAModel
,
"from_local_checkpoint"
,
patched_from_checkpoint
),
patch
.
object
(
wm_module
.
logger
,
"warning_once"
)
as
mock_warning
,
):
worker_manager
.
_load_adapter
(
lora_request
)
warning_args
=
mock_warning
.
call_args_list
found
=
any
(
"unsupported_module"
in
str
(
call
)
for
call
in
warning_args
)
assert
found
,
(
f
"Expected warning about 'unsupported_module', got:
{
warning_args
}
"
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_load_adapter_warns_on_target_modules_restriction
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
"""Test that _load_adapter warns when a LoRA adapter contains modules
excluded by the deployment-time target_modules restriction."""
from
unittest.mock
import
patch
import
vllm.lora.worker_manager
as
wm_module
# Restrict to only dense2 — adapter has dense1 which will be excluded
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
,
target_modules
=
[
"dense2"
],
)
dummy_lora_files
=
f
"
{
tmp_path
}
/lora_adapter"
os
.
makedirs
(
dummy_lora_files
,
exist_ok
=
True
)
create_peft_lora
(
dummy_model_gate_up
,
save_dir
=
dummy_lora_files
,
target_modules
=
[
"layer1.dense1"
,
"dense2"
],
lora_dtype
=
DEFAULT_DTYPE
,
)
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
lora_config
=
lora_config
)
vllm_config
.
scheduler_config
.
max_num_seqs
=
4
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
2
worker_manager
=
WorkerLoRAManager
(
vllm_config
,
device
,
EMBEDDING_MODULES
)
worker_manager
.
vocab_size
=
dummy_model_gate_up
.
unpadded_vocab_size
worker_manager
.
create_lora_manager
(
dummy_model_gate_up
)
lora_request
=
LoRARequest
(
"test"
,
1
,
dummy_lora_files
)
with
patch
.
object
(
wm_module
.
logger
,
"warning_once"
)
as
mock_warning
:
worker_manager
.
_load_adapter
(
lora_request
)
warning_args
=
mock_warning
.
call_args_list
# dense1 is supported by the model but excluded by target_modules
found
=
any
(
"target_modules"
in
str
(
call
)
for
call
in
warning_args
)
assert
found
,
(
f
"Expected warning about target_modules restriction, got:
{
warning_args
}
"
)
tests/lora/test_lora_utils.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.lora.utils
import
is_in_target_modules
,
is_supported_lora_module
class
TestIsSupportedLoraModule
:
"""Tests for is_supported_lora_module (model-definition check)."""
def
test_suffix_match
(
self
):
assert
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[
"o_proj"
,
"q_proj"
]
)
def
test_no_match
(
self
):
assert
not
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[
"q_proj"
,
"k_proj"
]
)
def
test_exact_match
(
self
):
assert
is_supported_lora_module
(
"o_proj"
,
[
"o_proj"
])
def
test_regex_suffix_matching
(
self
):
"""Regex anchors to end — partial suffix should not match."""
assert
not
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[
"proj"
])
def
test_empty_supported_modules
(
self
):
assert
not
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[])
def
test_multiple_supported_modules
(
self
):
supported
=
[
"q_proj"
,
"k_proj"
,
"v_proj"
,
"o_proj"
]
assert
is_supported_lora_module
(
"model.layers.0.self_attn.v_proj"
,
supported
)
assert
not
is_supported_lora_module
(
"model.layers.0.mlp.gate_proj"
,
supported
)
class
TestIsInTargetModules
:
"""Tests for is_in_target_modules (deployment-time filter)."""
def
test_none_allows_all
(
self
):
assert
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
None
)
def
test_suffix_in_target
(
self
):
assert
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
[
"o_proj"
,
"q_proj"
]
)
def
test_suffix_not_in_target
(
self
):
assert
not
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
[
"q_proj"
,
"k_proj"
]
)
def
test_empty_target_modules
(
self
):
assert
not
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
[])
def
test_exact_name_match
(
self
):
assert
is_in_target_modules
(
"dense1"
,
[
"dense1"
,
"dense2"
])
def
test_exact_name_no_match
(
self
):
assert
not
is_in_target_modules
(
"dense3"
,
[
"dense1"
,
"dense2"
])
tests/lora/test_qwen35_densemoel_lora.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
transformers
import
AutoTokenizer
import
vllm
import
vllm.config
from
vllm.lora.request
import
LoRARequest
from
..utils
import
create_new_process_for_each_test
,
multi_gpu_test
MODEL_PATH
=
"Qwen/Qwen3.5-4B"
PROMPT_TEMPLATE
=
"""Write a SQL query for the given database.
\n
Schema:
\n
Tables:
\n
- stadium(Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average)
\n
- singer(Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male)
\n
- concert(concert_ID, concert_Name, Theme, Stadium_ID, Year)
\n
- singer_in_concert(concert_ID, Singer_ID)
\n\n
Question:
\n
{query}"""
# noqa: E501
EXPECTED_LORA_OUTPUT
=
[
"SELECT count(*) FROM singer"
,
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'"
,
"SELECT name FROM stadium WHERE stadium_id NOT IN (SELECT stadium_id FROM concert)"
,
]
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_PATH
,
trust_remote_code
=
True
)
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
query
=
"How many singers do we have?"
),
PROMPT_TEMPLATE
.
format
(
query
=
(
"What is the average, minimum, and maximum "
"age of all singers from France?"
)
),
PROMPT_TEMPLATE
.
format
(
query
=
(
"What are the names of the stadiums without any concerts?"
)
),
]
input_templates
=
[]
for
prmpt
in
prompts
:
messages
=
[{
"role"
:
"user"
,
"content"
:
prmpt
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
,
enable_thinking
=
False
,
# disable thinking
)
input_templates
.
append
(
prompt
)
sampling_params
=
vllm
.
SamplingParams
(
temperature
=
0
,
max_tokens
=
512
)
outputs
=
llm
.
generate
(
input_templates
,
sampling_params
,
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
lora_path
)
if
lora_id
else
None
,
)
generated_texts
:
list
[
str
]
=
[]
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
.
strip
()
generated_texts
.
append
(
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
return
generated_texts
@
create_new_process_for_each_test
()
def
test_qwen35_dense_model_lora
(
qwen35_dense_model_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
512
,
enable_lora
=
True
,
max_loras
=
2
,
max_num_seqs
=
16
,
max_lora_rank
=
8
,
trust_remote_code
=
True
,
)
output1
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
output2
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
2
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
@
multi_gpu_test
(
num_gpus
=
4
)
def
test_qwen35_dense_model_lora_tp4
(
qwen35_dense_model_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
enable_lora
=
True
,
max_loras
=
2
,
max_lora_rank
=
8
,
max_num_seqs
=
16
,
tensor_parallel_size
=
4
,
trust_remote_code
=
True
,
fully_sharded_loras
=
False
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
output1
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
1
)
print
(
output1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
output2
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
2
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
@
multi_gpu_test
(
num_gpus
=
4
)
def
test_qwen35_dense_model_lora_tp4_fully_sharded_loras
(
qwen35_dense_model_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
512
,
enable_lora
=
True
,
max_loras
=
2
,
max_lora_rank
=
8
,
tensor_parallel_size
=
4
,
trust_remote_code
=
True
,
fully_sharded_loras
=
True
,
gpu_memory_utilization
=
0.8
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
output1
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
output2
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
2
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
tests/model_executor/layers/test_rocm_unquantized_gemm.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
unittest.mock
import
MagicMock
import
pytest
import
torch
from
vllm.platforms
import
current_platform
if
current_platform
.
is_cuda
():
pytest
.
skip
(
"ROCm skinny GEMM tests are not supported on CUDA."
,
allow_module_level
=
True
,
)
from
vllm.model_executor.layers
import
utils
def
test_rocm_unquantized_gemm_gfx1x_wvsplitk_path
(
monkeypatch
):
x
=
torch
.
randn
(
1
,
64
,
dtype
=
torch
.
float16
)
weight
=
torch
.
randn
(
128
,
64
,
dtype
=
torch
.
float16
)
monkeypatch
.
setattr
(
utils
,
"use_aiter_triton_gemm"
,
lambda
*
args
:
False
)
monkeypatch
.
setattr
(
utils
.
envs
,
"VLLM_ROCM_USE_SKINNY_GEMM"
,
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx1x"
,
lambda
:
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx9"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx950"
,
lambda
:
False
)
monkeypatch
.
setattr
(
utils
,
"get_cu_count"
,
lambda
:
120
)
wvsplitk_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitK"
,
wvsplitk_mock
)
llmm1_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"LLMM1"
,
llmm1_mock
)
out
=
utils
.
rocm_unquantized_gemm_impl
(
x
,
weight
,
None
)
ref
=
torch
.
nn
.
functional
.
linear
(
x
,
weight
,
None
)
wvsplitk_mock
.
assert_called_once
()
llmm1_mock
.
assert_not_called
()
assert
torch
.
allclose
(
out
,
ref
,
atol
=
1e-3
,
rtol
=
1e-3
)
def
test_rocm_unquantized_gemm_gfx1x_n_gt_4_falls_back
(
monkeypatch
):
x
=
torch
.
randn
(
5
,
64
,
dtype
=
torch
.
float16
)
weight
=
torch
.
randn
(
128
,
64
,
dtype
=
torch
.
float16
)
monkeypatch
.
setattr
(
utils
,
"use_aiter_triton_gemm"
,
lambda
*
args
:
False
)
monkeypatch
.
setattr
(
utils
.
envs
,
"VLLM_ROCM_USE_SKINNY_GEMM"
,
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx1x"
,
lambda
:
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx9"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx950"
,
lambda
:
False
)
monkeypatch
.
setattr
(
utils
,
"get_cu_count"
,
lambda
:
120
)
wvsplitk_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitK"
,
wvsplitk_mock
)
llmm1_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"LLMM1"
,
llmm1_mock
)
out
=
utils
.
rocm_unquantized_gemm_impl
(
x
,
weight
,
None
)
ref
=
torch
.
nn
.
functional
.
linear
(
x
,
weight
,
None
)
wvsplitk_mock
.
assert_not_called
()
llmm1_mock
.
assert_not_called
()
assert
torch
.
allclose
(
out
,
ref
,
atol
=
1e-3
,
rtol
=
1e-3
)
def
test_rocm_unquantized_gemm_gfx950_wvsplitkrc_path
(
monkeypatch
):
x
=
torch
.
randn
(
16
,
1024
,
dtype
=
torch
.
float16
)
weight
=
torch
.
randn
(
256
,
1024
,
dtype
=
torch
.
float16
)
monkeypatch
.
setattr
(
utils
,
"use_aiter_triton_gemm"
,
lambda
*
args
:
False
)
monkeypatch
.
setattr
(
utils
.
envs
,
"VLLM_ROCM_USE_SKINNY_GEMM"
,
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx1x"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx9"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx950"
,
lambda
:
True
)
monkeypatch
.
setattr
(
utils
,
"get_cu_count"
,
lambda
:
120
)
wvsplitkrc_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitKrc"
,
wvsplitkrc_mock
)
wvsplitk_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitK"
,
wvsplitk_mock
)
out
=
utils
.
rocm_unquantized_gemm_impl
(
x
,
weight
,
None
)
ref
=
torch
.
nn
.
functional
.
linear
(
x
,
weight
,
None
)
wvsplitkrc_mock
.
assert_called_once
()
wvsplitk_mock
.
assert_not_called
()
assert
torch
.
allclose
(
out
,
ref
,
atol
=
1e-3
,
rtol
=
1e-3
)
tests/models/language/pooling/test_colbert.py
View file @
0da93439
...
...
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
"model_cls"
:
"AutoModel"
,
},
},
"lfm2"
:
{
"model"
:
"LiquidAI/LFM2-ColBERT-350M"
,
"colbert_dim"
:
128
,
"max_model_len"
:
511
,
"extra_kwargs"
:
{
"hf_overrides"
:
{
"architectures"
:
[
"ColBERTLfm2Model"
],
},
},
"hf_comparison"
:
{
"weights_file"
:
"1_Dense/model.safetensors"
,
"weights_key"
:
"linear.weight"
,
"trust_remote_code"
:
False
,
"model_cls"
:
"AutoModel"
,
},
},
}
...
...
tests/models/multimodal/generation/test_common.py
View file @
0da93439
...
...
@@ -220,7 +220,10 @@ VLM_TEST_SETTINGS = {
vllm_runner_kwargs
=
{
"model_impl"
:
"transformers"
,
},
marks
=
[
pytest
.
mark
.
core_model
],
marks
=
[
pytest
.
mark
.
core_model
,
*
([
large_gpu_mark
(
min_gb
=
80
)]
if
current_platform
.
is_rocm
()
else
[]),
],
),
"idefics3-transformers"
:
VLMTestInfo
(
models
=
[
"HuggingFaceTB/SmolVLM-256M-Instruct"
],
...
...
tests/models/multimodal/generation/test_granite_speech.py
View file @
0da93439
...
...
@@ -39,7 +39,11 @@ models = [MODEL_NAME]
def
granite_speech_attention_config
():
"""Return attention config for Granite Speech tests on ROCm."""
if
current_platform
.
is_rocm
():
return
{
"backend"
:
"ROCM_AITER_FA"
}
from
vllm.platforms.rocm
import
on_mi3xx
if
on_mi3xx
():
return
{
"backend"
:
"ROCM_AITER_FA"
}
return
{
"backend"
:
"TRITON_ATTN"
}
return
None
...
...
tests/models/multimodal/generation/test_keye.py
View file @
0da93439
...
...
@@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple):
sampling_params
:
SamplingParams
|
None
=
None
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"question"
,
[
QUESTION
])
def
test_keye_vl
(
image_assets
,
question
:
str
,
):
def
test_keye_vl
(
image_assets
,
question
:
str
):
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
image_urls
=
[
encode_image_url
(
image
)
for
image
in
images
]
...
...
tests/models/multimodal/generation/test_nemotron_parse.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
collections.abc
import
Iterable
,
Sequence
import
pytest
import
regex
as
re
from
transformers
import
AutoModel
from
tests.models.utils
import
check_logprobs_close
from
vllm.assets.image
import
ImageAsset
from
vllm.logprobs
import
Logprob
,
SampleLogprobs
from
vllm.tokenizers
import
TokenizerLike
from
....conftest
import
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
create_new_process_for_each_test
IMAGE
=
ImageAsset
(
"paper-11"
).
pil_image_ext
(
ext
=
"png"
).
convert
(
"RGB"
)
PROMPT
=
"</s><s><predict_bbox><predict_classes><output_markdown>"
class
DummyLogprobs
(
dict
[
int
,
Logprob
]):
def
__init__
(
self
,
vocab_ids
:
Iterable
[
int
]):
super
().
__init__
(
dict
.
fromkeys
(
vocab_ids
,
Logprob
(
0.0
)))
def
__repr__
(
self
):
return
"DummyLogprobs()"
def
mask_bbox_tokens
(
output
:
tuple
[
list
[
int
],
str
,
SampleLogprobs
],
tokenizer
:
TokenizerLike
,
)
->
tuple
[
list
[
int
],
str
,
SampleLogprobs
]:
"""
Always pass check_logprobs_close check for bounding box tokens
because it is reasonable for them to differ slightly.
"""
ignore_pattern
=
r
"<[xy]_[\d.]+>"
vocab
=
tokenizer
.
get_vocab
()
output_ids
,
output_str
,
out_logprobs
=
output
masked_logprobs
=
list
[
dict
[
int
,
Logprob
]]()
for
token
,
logprobs
in
zip
(
output_ids
,
out_logprobs
):
if
re
.
match
(
ignore_pattern
,
tokenizer
.
decode
(
token
)):
masked_logprobs
.
append
(
DummyLogprobs
(
vocab
.
values
()))
else
:
masked_logprobs
.
append
(
logprobs
)
return
output_ids
,
output_str
,
masked_logprobs
def
run_test
(
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
...
...
@@ -44,6 +76,8 @@ def run_test(
for
prompts
,
images
in
inputs
]
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
...
...
@@ -58,18 +92,20 @@ def run_test(
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
vllm_outputs_per_case
):
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
outputs_0_lst
=
[
mask_bbox_tokens
(
output
,
tokenizer
)
for
output
in
hf_outputs
],
outputs_1_lst
=
[
mask_bbox_tokens
(
output
,
tokenizer
)
for
output
in
vllm_outputs
],
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"nvidia/NVIDIA-Nemotron-Parse-v1.1"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
create_new_process_for_each_test
(
"spawn"
)
def
test_models
(
hf_runner
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
num_logprobs
:
int
)
->
None
:
...
...
@@ -77,10 +113,7 @@ def test_models(
hf_runner
,
vllm_runner
,
inputs
=
[
(
[
PROMPT
]
*
10
,
[
IMAGE
]
*
10
,
),
([
PROMPT
]
*
10
,
[
IMAGE
]
*
10
),
],
model
=
model
,
dtype
=
dtype
,
...
...
tests/models/multimodal/generation/vlm_utils/builders.py
View file @
0da93439
...
...
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
)
resampler
=
AudioResampler
(
target_sr
=
16000
,
method
=
"librosa"
,
)
resampler
=
AudioResampler
(
target_sr
=
16000
)
audios
=
[
asset
.
audio_and_sample_rate
for
asset
in
audio_assets
]
resampled_audios
=
[
(
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
0da93439
...
...
@@ -24,6 +24,7 @@ from transformers import (
GenerationConfig
,
GenerationMixin
,
)
from
transformers.masking_utils
import
create_causal_mask
from
transformers.video_utils
import
VideoMetadata
from
vllm.logprobs
import
SampleLogprobs
...
...
@@ -489,13 +490,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
from
vllm.model_executor.models.h2ovl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
from
vllm.transformers_utils.processors.h2ovl
import
(
image_to_pixel_values_h2ovl
,
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
image_to_pixel_values_h2ovl
(
...
...
@@ -679,10 +681,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
sin
=
sin
.
to
(
inputs_embeds
.
dtype
)
# Prepare attention mask
if
attention_mask
is
not
None
:
attention_mask
=
self
.
_update_causal_mask
(
attention_mask
,
inputs_embeds
,
cache_position
,
past_key_values
,
False
)
attention_mask
=
create_causal_mask
(
config
=
self
.
config
,
input_embeds
=
inputs_embeds
,
attention_mask
=
attention_mask
,
past_key_values
=
past_key_values
,
position_ids
=
position_ids
,
cache_position
=
cache_position
,
)
# Initialize and collect hidden states
hidden_states
=
inputs_embeds
...
...
@@ -751,16 +757,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
from
vllm.model_executor.models.skyworkr1v
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values_skyworkr1v
,
from
vllm.transformers_utils.processors.internvl
import
(
image_to_pixel_values_internvl
,
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
image_to_pixel_values_
skyworkr1v
(
image_to_pixel_values_
internvl
(
image
,
input_size
=
self
.
image_size
,
min_num
=
self
.
min_num
,
...
...
@@ -815,14 +822,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos
:
npt
.
NDArray
|
list
[
npt
.
NDArray
]
=
None
,
**
kwargs
,
):
from
vllm.model_executor.models.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
from
vllm.transformers_utils.processors.internvl
import
(
image_to_pixel_values_internvl
,
video_to_pixel_values_internvl
,
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
videos
=
[
videos
]
if
isinstance
(
videos
,
np
.
ndarray
)
else
videos
if
images
is
not
None
:
...
...
@@ -1260,9 +1268,9 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
generated).
"""
import
base64
import
io
import
pybase64
as
base64
import
soundfile
as
sf
processor
=
hf_model
.
processor
...
...
tests/models/multimodal/pooling/test_colpali.py
View file @
0da93439
...
...
@@ -7,9 +7,9 @@ ColPali is a multi-vector retrieval model based on PaliGemma backbone
It produces per-token embeddings for both text and image inputs.
"""
import
base64
from
io
import
BytesIO
import
pybase64
as
base64
import
pytest
import
torch
from
PIL
import
Image
...
...
tests/models/multimodal/pooling/test_colqwen3.py
View file @
0da93439
...
...
@@ -7,9 +7,9 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
"""
import
base64
from
io
import
BytesIO
import
pybase64
as
base64
import
pytest
import
torch
from
PIL
import
Image
...
...
tests/models/multimodal/pooling/test_colqwen3_5.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ColQwen3.5 late interaction model for multi-modal retrieval.
ColQwen3.5 is a multi-vector retrieval model based on Qwen3.5 backbone with
ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
"""
import
pytest
import
torch
from
....conftest
import
VllmRunner
# Expected per-token embedding width for each checkpoint under test.
EMBED_DIMS = {"athrael-soju/colqwen3.5-4.5B-v3": 320}

# Checkpoints the parametrized tests iterate over (one entry per
# EMBED_DIMS key, in insertion order).
MODELS = list(EMBED_DIMS)

# Sample text-only queries; the helpers below use the first entry.
TEXT_QUERIES = [
    "What is the capital of France?",
    "Describe the contents of the document.",
]

# Sample text-only documents; the helpers below use the first entry.
TEXT_DOCUMENTS = [
    "The capital of France is Paris.",
    "This document contains important financial data.",
]

# dtype string forwarded to the vLLM runner by the parametrized tests.
DTYPE = "half"
def _run_token_embed_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check the shape and L2 norm of token embeddings for a single query.

    Embeds ``TEXT_QUERIES[0]`` through the pooling runner and asserts that
    the result is one 2-D matrix whose second dimension equals
    ``EMBED_DIMS[model]``, that it has more than one row, and that every row
    has unit L2 norm within a 1e-2 tolerance.

    Args:
        vllm_runner: Context-manager factory that builds the vLLM model.
        model: Model identifier; must be a key of ``EMBED_DIMS``.
        dtype: dtype string forwarded to the runner.
    """
    engine_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        results = vllm_model.token_embed([TEXT_QUERIES[0]])

        # One prompt in, one embedding matrix out.
        assert len(results) == 1
        token_embeddings = torch.tensor(results[0])

        # Expected layout is [num_tokens, embed_dim].
        assert token_embeddings.dim() == 2
        num_tokens, embed_dim = token_embeddings.shape
        assert embed_dim == EMBED_DIMS[model]
        assert num_tokens > 1

        # Each token vector should be L2-normalized.
        row_norms = torch.norm(token_embeddings, p=2, dim=-1)
        expected_norms = torch.ones_like(row_norms)
        torch.testing.assert_close(
            row_norms,
            expected_norms,
            rtol=1e-2,
            atol=1e-2,
        )
def _run_late_interaction_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that the runner's score agrees with a manually computed MaxSim.

    Token embeddings for ``TEXT_QUERIES[0]`` and ``TEXT_DOCUMENTS[0]`` are
    fed to ``compute_maxsim_score``; the result must match the single value
    returned by ``vllm_model.score`` for the same pair within 1% relative
    tolerance.

    Args:
        vllm_runner: Context-manager factory that builds the vLLM model.
        model: Model identifier passed to the runner.
        dtype: dtype string forwarded to the runner.
    """
    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score

    query_text = TEXT_QUERIES[0]
    document_text = TEXT_DOCUMENTS[0]

    engine_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        query_embeddings = torch.tensor(vllm_model.token_embed([query_text])[0])
        document_embeddings = torch.tensor(
            vllm_model.token_embed([document_text])[0]
        )

        # Reference value computed from the raw token embeddings.
        reference = compute_maxsim_score(
            query_embeddings, document_embeddings
        ).item()

        scored = vllm_model.score(query_text, document_text)

        assert len(scored) == 1
        assert scored[0] == pytest.approx(reference, rel=0.01)
def _run_relevance_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that on-topic documents outscore an off-topic one.

    Scores one machine-learning query against three documents and asserts
    that both the machine-learning and deep-learning documents score
    strictly higher than the weather document.

    Args:
        vllm_runner: Context-manager factory that builds the vLLM model.
        model: Model identifier passed to the runner.
        dtype: dtype string forwarded to the runner.
    """
    query = "What is machine learning?"
    ml_doc = "Machine learning is a subset of artificial intelligence."
    weather_doc = "The weather forecast shows rain tomorrow."
    dl_doc = "Deep learning uses neural networks for complex tasks."
    documents = [ml_doc, weather_doc, dl_doc]

    engine_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        scores = vllm_model.score(query, documents)

        assert len(scores) == 3
        # Scores come back in document order: ML, weather, DL.
        ml_score, weather_score, dl_score = scores
        assert ml_score > weather_score, "ML doc should score higher than weather doc"
        assert dl_score > weather_score, "DL doc should score higher than weather doc"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_token_embed(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Run the token-embedding shape/normalization check for each model."""
    _run_token_embed_test(vllm_runner, model=model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_late_interaction_scoring(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Run the MaxSim-vs-runner score agreement check for each model."""
    _run_late_interaction_test(vllm_runner, model=model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_relevance_ordering(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Run the relevant-beats-irrelevant ordering check for each model."""
    _run_relevance_test(vllm_runner, model=model, dtype=dtype)
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
View file @
0da93439
...
...
@@ -9,10 +9,10 @@ Tests for the LlamaNemotronVL model family:
Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
"""
import
base64
from
io
import
BytesIO
from
pathlib
import
Path
import
pybase64
as
base64
import
pytest
import
torch
from
transformers
import
AutoModel
,
AutoModelForSequenceClassification
,
AutoProcessor
...
...
@@ -22,8 +22,10 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartTextParam
,
)
from
vllm.entrypoints.pooling.score.utils
import
ScoreMultiModalParam
from
vllm.platforms
import
current_platform
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
ROCM_ENGINE_KWARGS
from
...utils
import
check_embeddings_close
# Prefixes used by the model API
...
...
@@ -70,6 +72,7 @@ def _run_test(
max_model_len
=
2048
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
**
ROCM_ENGINE_KWARGS
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
)
...
...
@@ -250,6 +253,7 @@ def _run_vllm_reranker(
max_model_len
=
2048
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
**
ROCM_ENGINE_KWARGS
,
)
as
vllm_model
:
has_images
=
any
(
img
is
not
None
for
_
,
img
in
docs
)
...
...
@@ -322,8 +326,11 @@ def _run_reranker_test(
assert
len
(
hf_scores
)
==
len
(
vllm_scores
),
(
f
"Output length mismatch: HF=
{
len
(
hf_scores
)
}
, vLLM=
{
len
(
vllm_scores
)
}
"
)
# NOTE: ROCm shows slightly higher numerical variance due to different attention
# backend between vLLM and HF; use a marginally looser tolerance
rel_tol
=
0.022
if
current_platform
.
is_rocm
()
else
0.02
for
i
,
(
hf_score
,
vllm_score
)
in
enumerate
(
zip
(
hf_scores
,
vllm_scores
)):
assert
hf_score
==
pytest
.
approx
(
vllm_score
,
rel
=
0.02
),
(
assert
hf_score
==
pytest
.
approx
(
vllm_score
,
rel
=
rel_tol
),
(
f
"Score mismatch at index
{
i
}
: HF=
{
hf_score
:.
4
f
}
, vLLM=
{
vllm_score
:.
4
f
}
"
)
...
...
tests/models/multimodal/pooling/test_phi3v.py
View file @
0da93439
...
...
@@ -3,6 +3,7 @@
import
pytest
import
torch.nn.functional
as
F
import
transformers.utils
from
PIL
import
Image
from
vllm.assets.base
import
get_vllm_public_assets
...
...
@@ -12,6 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from
....utils
import
large_gpu_test
from
...utils
import
check_embeddings_close
# Backward-compatibility shim for a method that was deleted in
# Transformers v5: re-create `is_flash_attn_greater_or_equal_2_10` on
# `transformers.utils` as a zero-argument callable that delegates to
# `is_flash_attn_greater_or_equal("2.1.0")`.
# Only needed for generating the HF reference.
transformers.utils.is_flash_attn_greater_or_equal_2_10 = (
    lambda: transformers.utils.is_flash_attn_greater_or_equal("2.1.0")
)
HF_TEXT_PROMPTS
=
[
# T -> X
"Find me an everyday image that matches the given caption: The label of the object is stop sign"
,
# noqa: E501
...
...
tests/models/multimodal/processing/test_h2ovl.py
View file @
0da93439
...
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
max_num
:
int
,
):
from
vllm.
model_executor.model
s.h2ovl
import
(
from
vllm.
transformers_utils.processor
s.h2ovl
import
(
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
,
)
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment