Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0da93439
Commit
0da93439
authored
Mar 26, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori
parents
25f2f756
298e5108
Changes
613
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
751 additions
and
133 deletions
+751
-133
tests/kernels/quantization/test_mxfp4_triton_ep.py
tests/kernels/quantization/test_mxfp4_triton_ep.py
+0
-83
tests/kernels/quantization/test_rocm_skinny_gemms.py
tests/kernels/quantization/test_rocm_skinny_gemms.py
+5
-4
tests/lora/conftest.py
tests/lora/conftest.py
+5
-0
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+189
-0
tests/lora/test_lora_utils.py
tests/lora/test_lora_utils.py
+60
-0
tests/lora/test_qwen35_densemoel_lora.py
tests/lora/test_qwen35_densemoel_lora.py
+132
-0
tests/model_executor/layers/test_rocm_unquantized_gemm.py
tests/model_executor/layers/test_rocm_unquantized_gemm.py
+89
-0
tests/models/language/pooling/test_colbert.py
tests/models/language/pooling/test_colbert.py
+16
-0
tests/models/multimodal/generation/test_common.py
tests/models/multimodal/generation/test_common.py
+4
-1
tests/models/multimodal/generation/test_granite_speech.py
tests/models/multimodal/generation/test_granite_speech.py
+5
-1
tests/models/multimodal/generation/test_keye.py
tests/models/multimodal/generation/test_keye.py
+1
-5
tests/models/multimodal/generation/test_nemotron_parse.py
tests/models/multimodal/generation/test_nemotron_parse.py
+44
-11
tests/models/multimodal/generation/vlm_utils/builders.py
tests/models/multimodal/generation/vlm_utils/builders.py
+1
-4
tests/models/multimodal/generation/vlm_utils/model_utils.py
tests/models/multimodal/generation/vlm_utils/model_utils.py
+27
-19
tests/models/multimodal/pooling/test_colpali.py
tests/models/multimodal/pooling/test_colpali.py
+1
-1
tests/models/multimodal/pooling/test_colqwen3.py
tests/models/multimodal/pooling/test_colqwen3.py
+1
-1
tests/models/multimodal/pooling/test_colqwen3_5.py
tests/models/multimodal/pooling/test_colqwen3_5.py
+154
-0
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
+9
-2
tests/models/multimodal/pooling/test_phi3v.py
tests/models/multimodal/pooling/test_phi3v.py
+7
-0
tests/models/multimodal/processing/test_h2ovl.py
tests/models/multimodal/processing/test_h2ovl.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
613 of 613+
files are displayed.
Plain diff
Email patch
tests/kernels/quantization/test_mxfp4_triton_ep.py
View file @
0da93439
...
@@ -17,89 +17,6 @@ from unittest.mock import MagicMock, patch
...
@@ -17,89 +17,6 @@ from unittest.mock import MagicMock, patch
import
pytest
import
pytest
import
torch
import
torch
from
vllm.model_executor.layers.quantization.mxfp4
import
(
Mxfp4Backend
,
Mxfp4MoEMethod
,
)
def
_make_mock_moe_config
(
ep_size
:
int
=
1
)
->
MagicMock
:
"""Create a mock FusedMoEConfig with the given EP size."""
parallel_config
=
MagicMock
()
parallel_config
.
ep_size
=
ep_size
moe_config
=
MagicMock
()
moe_config
.
ep_size
=
ep_size
moe_config
.
is_lora_enabled
=
False
moe_config
.
moe_parallel_config
=
parallel_config
return
moe_config
class
TestMxfp4TritonIsMonolithic
:
"""Verify that is_monolithic is always True for the TRITON backend,
regardless of EP size, since triton_kernel_moe_forward now handles
expert_map remapping internally."""
@
pytest
.
mark
.
parametrize
(
"backend,ep_size,expected_monolithic"
,
[
# TRITON is always monolithic (handles EP via expert_map remapping)
(
Mxfp4Backend
.
TRITON
,
1
,
True
),
(
Mxfp4Backend
.
TRITON
,
2
,
True
),
(
Mxfp4Backend
.
TRITON
,
4
,
True
),
# SM100 backends are always monolithic
(
Mxfp4Backend
.
SM100_FI_MXFP4_MXFP8_TRTLLM
,
1
,
True
),
(
Mxfp4Backend
.
SM100_FI_MXFP4_MXFP8_TRTLLM
,
2
,
True
),
(
Mxfp4Backend
.
SM100_FI_MXFP4_BF16
,
1
,
True
),
(
Mxfp4Backend
.
SM100_FI_MXFP4_BF16
,
2
,
True
),
# MARLIN is never monolithic
(
Mxfp4Backend
.
MARLIN
,
1
,
False
),
(
Mxfp4Backend
.
MARLIN
,
2
,
False
),
],
ids
=
[
"triton-no-ep"
,
"triton-ep2"
,
"triton-ep4"
,
"sm100-trtllm-no-ep"
,
"sm100-trtllm-ep2"
,
"sm100-bf16-no-ep"
,
"sm100-bf16-ep2"
,
"marlin-no-ep"
,
"marlin-ep2"
,
],
)
@
patch
(
"vllm.model_executor.layers.quantization.mxfp4.get_mxfp4_backend"
,
)
@
patch
(
"vllm.model_executor.layers.quantization.mxfp4.get_current_vllm_config"
,
)
def
test_is_monolithic
(
self
,
mock_get_config
,
mock_get_backend
,
backend
,
ep_size
,
expected_monolithic
,
):
"""is_monolithic should be True for TRITON regardless of EP size."""
mock_get_backend
.
return_value
=
backend
mock_compilation_config
=
MagicMock
()
mock_compilation_config
.
max_cudagraph_capture_size
=
1024
mock_vllm_config
=
MagicMock
()
mock_vllm_config
.
compilation_config
=
mock_compilation_config
mock_get_config
.
return_value
=
mock_vllm_config
moe_config
=
_make_mock_moe_config
(
ep_size
=
ep_size
)
method
=
Mxfp4MoEMethod
(
moe_config
)
assert
method
.
is_monolithic
==
expected_monolithic
,
(
f
"Expected is_monolithic=
{
expected_monolithic
}
for "
f
"backend=
{
backend
.
name
}
, ep_size=
{
ep_size
}
, "
f
"but got
{
method
.
is_monolithic
}
."
)
class
TestTritonMoeForwardExpertMap
:
class
TestTritonMoeForwardExpertMap
:
"""Test that triton_kernel_moe_forward applies expert_map remapping
"""Test that triton_kernel_moe_forward applies expert_map remapping
...
...
tests/kernels/quantization/test_rocm_skinny_gemms.py
View file @
0da93439
...
@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode
...
@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode
BIAS
=
torch
.
rand
(
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
BIAS
=
torch
.
rand
(
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
elif
bias_mode
==
2
:
elif
bias_mode
==
2
:
BIAS
=
torch
.
rand
(
n
,
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
BIAS
=
torch
.
rand
(
n
,
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
elif
bias_mode
==
3
:
BIAS
=
torch
.
rand
(
1
,
m
,
dtype
=
dtype
,
device
=
"cuda"
)
*
2
-
1
ref_out
=
torch
.
nn
.
functional
.
linear
(
A
,
B
,
BIAS
)
ref_out
=
torch
.
nn
.
functional
.
linear
(
A
,
B
,
BIAS
)
out
=
ops
.
wvSplitKrc
(
A
,
B
,
cu_count
,
BIAS
)
out
=
ops
.
wvSplitKrc
(
A
,
B
,
cu_count
,
BIAS
)
...
@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel(
...
@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel(
ref_out
=
torch
.
nn
.
functional
.
linear
(
A
,
B
,
BIAS
)
ref_out
=
torch
.
nn
.
functional
.
linear
(
A
,
B
,
BIAS
)
out
=
ops
.
wvSplitK
(
B
,
A
.
view
(
-
1
,
A
.
size
(
-
1
)),
cu_count
,
BIAS
)
out
=
ops
.
wvSplitK
(
B
,
A
.
view
(
-
1
,
A
.
size
(
-
1
)),
cu_count
,
BIAS
)
if
xnorm
:
# Accumulation error in fp16 GEMM scales with sqrt(K)
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
1e-3
,
rtol
=
1e-8
)
atol
=
torch
.
finfo
(
dtype
).
eps
*
math
.
sqrt
(
k
)
else
:
torch
.
testing
.
assert_close
(
out
,
ref_out
,
atol
=
atol
,
rtol
=
1e-2
)
assert
torch
.
allclose
(
out
,
ref_out
,
atol
=
1e-3
,
rtol
=
1e-2
)
@
pytest
.
mark
.
parametrize
(
"xnorm"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"xnorm"
,
[
False
,
True
])
...
...
tests/lora/conftest.py
View file @
0da93439
...
@@ -294,6 +294,11 @@ def whisper_lora_files():
...
@@ -294,6 +294,11 @@ def whisper_lora_files():
return
snapshot_download
(
repo_id
=
"chengyili2005/whisper-small-mandarin-lora"
)
return
snapshot_download
(
repo_id
=
"chengyili2005/whisper-small-mandarin-lora"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
qwen35_dense_model_lora_files
():
return
snapshot_download
(
repo_id
=
"jeeejeee/qwen35-4b-text-only-sql-lora"
)
@
pytest
.
fixture
@
pytest
.
fixture
def
reset_default_device
():
def
reset_default_device
():
"""
"""
...
...
tests/lora/test_lora_manager.py
View file @
0da93439
...
@@ -711,3 +711,192 @@ def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, devic
...
@@ -711,3 +711,192 @@ def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, devic
torch
.
testing
.
assert_close
(
torch
.
testing
.
assert_close
(
packed_lora1
.
lora_b
[
1
],
model_lora_clone1
.
get_lora
(
"up_proj"
).
lora_b
packed_lora1
.
lora_b
[
1
],
model_lora_clone1
.
get_lora
(
"up_proj"
).
lora_b
)
)
def
_test_target_modules
(
model
,
target_modules
:
list
[
str
]
|
None
,
device
:
str
,
expected_lora
:
list
[
tuple
[
str
,
type
]],
expected_no_lora
:
list
[
tuple
[
str
,
type
]],
):
"""Create a LoRAModelManager and assert which modules have LoRA applied."""
LoRAModelManager
(
model
,
2
,
2
,
2
,
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
2
,
max_loras
=
2
,
lora_dtype
=
DEFAULT_DTYPE
,
target_modules
=
target_modules
,
),
device
=
device
,
)
for
module_path
,
lora_cls
in
expected_lora
:
assert
isinstance
(
model
.
get_submodule
(
module_path
),
lora_cls
)
for
module_path
,
lora_cls
in
expected_no_lora
:
assert
not
isinstance
(
model
.
get_submodule
(
module_path
),
lora_cls
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_target_modules_config
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
"""Test that target_modules config restricts which modules get LoRA applied."""
_test_target_modules
(
dummy_model
,
[
"dense1"
],
device
,
expected_lora
=
[
(
"dense1"
,
ColumnParallelLinearWithLoRA
),
(
"layer1.dense1"
,
ColumnParallelLinearWithLoRA
),
],
expected_no_lora
=
[
(
"dense2"
,
RowParallelLinearWithLoRA
),
(
"layer1.dense2"
,
RowParallelLinearWithLoRA
),
],
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_target_modules_multiple
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
"""Test that multiple target_modules work correctly."""
_test_target_modules
(
dummy_model
,
[
"dense1"
,
"dense2"
],
device
,
expected_lora
=
[
(
"dense1"
,
ColumnParallelLinearWithLoRA
),
(
"layer1.dense1"
,
ColumnParallelLinearWithLoRA
),
(
"dense2"
,
RowParallelLinearWithLoRA
),
(
"layer1.dense2"
,
RowParallelLinearWithLoRA
),
],
expected_no_lora
=
[],
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_target_modules_none_uses_all
(
default_vllm_config
,
dist_init
,
dummy_model
,
device
):
"""Test that target_modules=None uses all supported modules."""
_test_target_modules
(
dummy_model
,
None
,
device
,
expected_lora
=
[
(
"dense1"
,
ColumnParallelLinearWithLoRA
),
(
"layer1.dense1"
,
ColumnParallelLinearWithLoRA
),
(
"dense2"
,
RowParallelLinearWithLoRA
),
(
"layer1.dense2"
,
RowParallelLinearWithLoRA
),
],
expected_no_lora
=
[],
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_load_adapter_warns_on_unsupported_modules
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
"""Test that _load_adapter warns when a LoRA adapter contains modules
not in the model's supported LoRA target modules."""
from
unittest.mock
import
patch
import
vllm.lora.worker_manager
as
wm_module
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
)
dummy_lora_files
=
f
"
{
tmp_path
}
/lora_adapter"
os
.
makedirs
(
dummy_lora_files
,
exist_ok
=
True
)
create_peft_lora
(
dummy_model_gate_up
,
save_dir
=
dummy_lora_files
,
target_modules
=
[
"layer1.dense1"
,
"dense2"
],
lora_dtype
=
DEFAULT_DTYPE
,
)
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
lora_config
=
lora_config
)
vllm_config
.
scheduler_config
.
max_num_seqs
=
4
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
2
worker_manager
=
WorkerLoRAManager
(
vllm_config
,
device
,
EMBEDDING_MODULES
)
worker_manager
.
vocab_size
=
dummy_model_gate_up
.
unpadded_vocab_size
worker_manager
.
create_lora_manager
(
dummy_model_gate_up
)
# Patch from_local_checkpoint to inject an unsupported module
original_from_checkpoint
=
LoRAModel
.
from_local_checkpoint
def
patched_from_checkpoint
(
*
args
,
**
kwargs
):
lora
=
original_from_checkpoint
(
*
args
,
**
kwargs
)
lora
.
loras
[
"unsupported_module"
]
=
LoRALayerWeights
(
module_name
=
"unsupported_module"
,
rank
=
8
,
lora_alpha
=
16
,
lora_a
=
torch
.
randn
(
8
,
10
),
lora_b
=
torch
.
randn
(
10
,
8
),
)
return
lora
lora_request
=
LoRARequest
(
"test"
,
1
,
dummy_lora_files
)
with
(
patch
.
object
(
LoRAModel
,
"from_local_checkpoint"
,
patched_from_checkpoint
),
patch
.
object
(
wm_module
.
logger
,
"warning_once"
)
as
mock_warning
,
):
worker_manager
.
_load_adapter
(
lora_request
)
warning_args
=
mock_warning
.
call_args_list
found
=
any
(
"unsupported_module"
in
str
(
call
)
for
call
in
warning_args
)
assert
found
,
(
f
"Expected warning about 'unsupported_module', got:
{
warning_args
}
"
)
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_load_adapter_warns_on_target_modules_restriction
(
default_vllm_config
,
dist_init
,
dummy_model_gate_up
,
device
,
tmp_path
):
"""Test that _load_adapter warns when a LoRA adapter contains modules
excluded by the deployment-time target_modules restriction."""
from
unittest.mock
import
patch
import
vllm.lora.worker_manager
as
wm_module
# Restrict to only dense2 — adapter has dense1 which will be excluded
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
4
,
max_loras
=
4
,
lora_dtype
=
DEFAULT_DTYPE
,
target_modules
=
[
"dense2"
],
)
dummy_lora_files
=
f
"
{
tmp_path
}
/lora_adapter"
os
.
makedirs
(
dummy_lora_files
,
exist_ok
=
True
)
create_peft_lora
(
dummy_model_gate_up
,
save_dir
=
dummy_lora_files
,
target_modules
=
[
"layer1.dense1"
,
"dense2"
],
lora_dtype
=
DEFAULT_DTYPE
,
)
model_config
=
ModelConfig
(
max_model_len
=
16
)
vllm_config
=
VllmConfig
(
model_config
=
model_config
,
lora_config
=
lora_config
)
vllm_config
.
scheduler_config
.
max_num_seqs
=
4
vllm_config
.
scheduler_config
.
max_num_batched_tokens
=
2
worker_manager
=
WorkerLoRAManager
(
vllm_config
,
device
,
EMBEDDING_MODULES
)
worker_manager
.
vocab_size
=
dummy_model_gate_up
.
unpadded_vocab_size
worker_manager
.
create_lora_manager
(
dummy_model_gate_up
)
lora_request
=
LoRARequest
(
"test"
,
1
,
dummy_lora_files
)
with
patch
.
object
(
wm_module
.
logger
,
"warning_once"
)
as
mock_warning
:
worker_manager
.
_load_adapter
(
lora_request
)
warning_args
=
mock_warning
.
call_args_list
# dense1 is supported by the model but excluded by target_modules
found
=
any
(
"target_modules"
in
str
(
call
)
for
call
in
warning_args
)
assert
found
,
(
f
"Expected warning about target_modules restriction, got:
{
warning_args
}
"
)
tests/lora/test_lora_utils.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.lora.utils
import
is_in_target_modules
,
is_supported_lora_module
class
TestIsSupportedLoraModule
:
"""Tests for is_supported_lora_module (model-definition check)."""
def
test_suffix_match
(
self
):
assert
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[
"o_proj"
,
"q_proj"
]
)
def
test_no_match
(
self
):
assert
not
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[
"q_proj"
,
"k_proj"
]
)
def
test_exact_match
(
self
):
assert
is_supported_lora_module
(
"o_proj"
,
[
"o_proj"
])
def
test_regex_suffix_matching
(
self
):
"""Regex anchors to end — partial suffix should not match."""
assert
not
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[
"proj"
])
def
test_empty_supported_modules
(
self
):
assert
not
is_supported_lora_module
(
"model.layers.0.self_attn.o_proj"
,
[])
def
test_multiple_supported_modules
(
self
):
supported
=
[
"q_proj"
,
"k_proj"
,
"v_proj"
,
"o_proj"
]
assert
is_supported_lora_module
(
"model.layers.0.self_attn.v_proj"
,
supported
)
assert
not
is_supported_lora_module
(
"model.layers.0.mlp.gate_proj"
,
supported
)
class
TestIsInTargetModules
:
"""Tests for is_in_target_modules (deployment-time filter)."""
def
test_none_allows_all
(
self
):
assert
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
None
)
def
test_suffix_in_target
(
self
):
assert
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
[
"o_proj"
,
"q_proj"
]
)
def
test_suffix_not_in_target
(
self
):
assert
not
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
[
"q_proj"
,
"k_proj"
]
)
def
test_empty_target_modules
(
self
):
assert
not
is_in_target_modules
(
"model.layers.0.self_attn.o_proj"
,
[])
def
test_exact_name_match
(
self
):
assert
is_in_target_modules
(
"dense1"
,
[
"dense1"
,
"dense2"
])
def
test_exact_name_no_match
(
self
):
assert
not
is_in_target_modules
(
"dense3"
,
[
"dense1"
,
"dense2"
])
tests/lora/test_qwen35_densemoel_lora.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
transformers
import
AutoTokenizer
import
vllm
import
vllm.config
from
vllm.lora.request
import
LoRARequest
from
..utils
import
create_new_process_for_each_test
,
multi_gpu_test
MODEL_PATH
=
"Qwen/Qwen3.5-4B"
PROMPT_TEMPLATE
=
"""Write a SQL query for the given database.
\n
Schema:
\n
Tables:
\n
- stadium(Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average)
\n
- singer(Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male)
\n
- concert(concert_ID, concert_Name, Theme, Stadium_ID, Year)
\n
- singer_in_concert(concert_ID, Singer_ID)
\n\n
Question:
\n
{query}"""
# noqa: E501
EXPECTED_LORA_OUTPUT
=
[
"SELECT count(*) FROM singer"
,
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'"
,
"SELECT name FROM stadium WHERE stadium_id NOT IN (SELECT stadium_id FROM concert)"
,
]
tokenizer
=
AutoTokenizer
.
from_pretrained
(
MODEL_PATH
,
trust_remote_code
=
True
)
def
do_sample
(
llm
:
vllm
.
LLM
,
lora_path
:
str
,
lora_id
:
int
)
->
list
[
str
]:
prompts
=
[
PROMPT_TEMPLATE
.
format
(
query
=
"How many singers do we have?"
),
PROMPT_TEMPLATE
.
format
(
query
=
(
"What is the average, minimum, and maximum "
"age of all singers from France?"
)
),
PROMPT_TEMPLATE
.
format
(
query
=
(
"What are the names of the stadiums without any concerts?"
)
),
]
input_templates
=
[]
for
prmpt
in
prompts
:
messages
=
[{
"role"
:
"user"
,
"content"
:
prmpt
}]
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
,
enable_thinking
=
False
,
# disable thinking
)
input_templates
.
append
(
prompt
)
sampling_params
=
vllm
.
SamplingParams
(
temperature
=
0
,
max_tokens
=
512
)
outputs
=
llm
.
generate
(
input_templates
,
sampling_params
,
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
lora_path
)
if
lora_id
else
None
,
)
generated_texts
:
list
[
str
]
=
[]
for
output
in
outputs
:
prompt
=
output
.
prompt
generated_text
=
output
.
outputs
[
0
].
text
.
strip
()
generated_texts
.
append
(
generated_text
)
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
return
generated_texts
@
create_new_process_for_each_test
()
def
test_qwen35_dense_model_lora
(
qwen35_dense_model_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
512
,
enable_lora
=
True
,
max_loras
=
2
,
max_num_seqs
=
16
,
max_lora_rank
=
8
,
trust_remote_code
=
True
,
)
output1
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
output2
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
2
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
@
multi_gpu_test
(
num_gpus
=
4
)
def
test_qwen35_dense_model_lora_tp4
(
qwen35_dense_model_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
1024
,
enable_lora
=
True
,
max_loras
=
2
,
max_lora_rank
=
8
,
max_num_seqs
=
16
,
tensor_parallel_size
=
4
,
trust_remote_code
=
True
,
fully_sharded_loras
=
False
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
output1
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
1
)
print
(
output1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
output2
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
2
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
@
multi_gpu_test
(
num_gpus
=
4
)
def
test_qwen35_dense_model_lora_tp4_fully_sharded_loras
(
qwen35_dense_model_lora_files
):
llm
=
vllm
.
LLM
(
MODEL_PATH
,
max_model_len
=
512
,
enable_lora
=
True
,
max_loras
=
2
,
max_lora_rank
=
8
,
tensor_parallel_size
=
4
,
trust_remote_code
=
True
,
fully_sharded_loras
=
True
,
gpu_memory_utilization
=
0.8
,
compilation_config
=
vllm
.
config
.
CompilationConfig
(
# Avoid OOM
cudagraph_specialize_lora
=
False
,
),
)
output1
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
1
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output1
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
output2
=
do_sample
(
llm
,
qwen35_dense_model_lora_files
,
lora_id
=
2
)
for
i
in
range
(
len
(
EXPECTED_LORA_OUTPUT
)):
assert
output2
[
i
]
==
EXPECTED_LORA_OUTPUT
[
i
]
tests/model_executor/layers/test_rocm_unquantized_gemm.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
unittest.mock
import
MagicMock
import
pytest
import
torch
from
vllm.platforms
import
current_platform
if
current_platform
.
is_cuda
():
pytest
.
skip
(
"ROCm skinny GEMM tests are not supported on CUDA."
,
allow_module_level
=
True
,
)
from
vllm.model_executor.layers
import
utils
def
test_rocm_unquantized_gemm_gfx1x_wvsplitk_path
(
monkeypatch
):
x
=
torch
.
randn
(
1
,
64
,
dtype
=
torch
.
float16
)
weight
=
torch
.
randn
(
128
,
64
,
dtype
=
torch
.
float16
)
monkeypatch
.
setattr
(
utils
,
"use_aiter_triton_gemm"
,
lambda
*
args
:
False
)
monkeypatch
.
setattr
(
utils
.
envs
,
"VLLM_ROCM_USE_SKINNY_GEMM"
,
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx1x"
,
lambda
:
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx9"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx950"
,
lambda
:
False
)
monkeypatch
.
setattr
(
utils
,
"get_cu_count"
,
lambda
:
120
)
wvsplitk_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitK"
,
wvsplitk_mock
)
llmm1_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"LLMM1"
,
llmm1_mock
)
out
=
utils
.
rocm_unquantized_gemm_impl
(
x
,
weight
,
None
)
ref
=
torch
.
nn
.
functional
.
linear
(
x
,
weight
,
None
)
wvsplitk_mock
.
assert_called_once
()
llmm1_mock
.
assert_not_called
()
assert
torch
.
allclose
(
out
,
ref
,
atol
=
1e-3
,
rtol
=
1e-3
)
def
test_rocm_unquantized_gemm_gfx1x_n_gt_4_falls_back
(
monkeypatch
):
x
=
torch
.
randn
(
5
,
64
,
dtype
=
torch
.
float16
)
weight
=
torch
.
randn
(
128
,
64
,
dtype
=
torch
.
float16
)
monkeypatch
.
setattr
(
utils
,
"use_aiter_triton_gemm"
,
lambda
*
args
:
False
)
monkeypatch
.
setattr
(
utils
.
envs
,
"VLLM_ROCM_USE_SKINNY_GEMM"
,
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx1x"
,
lambda
:
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx9"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx950"
,
lambda
:
False
)
monkeypatch
.
setattr
(
utils
,
"get_cu_count"
,
lambda
:
120
)
wvsplitk_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitK"
,
wvsplitk_mock
)
llmm1_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"LLMM1"
,
llmm1_mock
)
out
=
utils
.
rocm_unquantized_gemm_impl
(
x
,
weight
,
None
)
ref
=
torch
.
nn
.
functional
.
linear
(
x
,
weight
,
None
)
wvsplitk_mock
.
assert_not_called
()
llmm1_mock
.
assert_not_called
()
assert
torch
.
allclose
(
out
,
ref
,
atol
=
1e-3
,
rtol
=
1e-3
)
def
test_rocm_unquantized_gemm_gfx950_wvsplitkrc_path
(
monkeypatch
):
x
=
torch
.
randn
(
16
,
1024
,
dtype
=
torch
.
float16
)
weight
=
torch
.
randn
(
256
,
1024
,
dtype
=
torch
.
float16
)
monkeypatch
.
setattr
(
utils
,
"use_aiter_triton_gemm"
,
lambda
*
args
:
False
)
monkeypatch
.
setattr
(
utils
.
envs
,
"VLLM_ROCM_USE_SKINNY_GEMM"
,
True
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx1x"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx9"
,
lambda
:
False
)
monkeypatch
.
setattr
(
"vllm.platforms.rocm.on_gfx950"
,
lambda
:
True
)
monkeypatch
.
setattr
(
utils
,
"get_cu_count"
,
lambda
:
120
)
wvsplitkrc_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitKrc"
,
wvsplitkrc_mock
)
wvsplitk_mock
=
MagicMock
(
side_effect
=
lambda
w
,
x_view
,
_
,
__
:
x_view
@
w
.
t
())
monkeypatch
.
setattr
(
utils
.
ops
,
"wvSplitK"
,
wvsplitk_mock
)
out
=
utils
.
rocm_unquantized_gemm_impl
(
x
,
weight
,
None
)
ref
=
torch
.
nn
.
functional
.
linear
(
x
,
weight
,
None
)
wvsplitkrc_mock
.
assert_called_once
()
wvsplitk_mock
.
assert_not_called
()
assert
torch
.
allclose
(
out
,
ref
,
atol
=
1e-3
,
rtol
=
1e-3
)
tests/models/language/pooling/test_colbert.py
View file @
0da93439
...
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
...
@@ -59,6 +59,22 @@ COLBERT_MODELS = {
"model_cls"
:
"AutoModel"
,
"model_cls"
:
"AutoModel"
,
},
},
},
},
"lfm2"
:
{
"model"
:
"LiquidAI/LFM2-ColBERT-350M"
,
"colbert_dim"
:
128
,
"max_model_len"
:
511
,
"extra_kwargs"
:
{
"hf_overrides"
:
{
"architectures"
:
[
"ColBERTLfm2Model"
],
},
},
"hf_comparison"
:
{
"weights_file"
:
"1_Dense/model.safetensors"
,
"weights_key"
:
"linear.weight"
,
"trust_remote_code"
:
False
,
"model_cls"
:
"AutoModel"
,
},
},
}
}
...
...
tests/models/multimodal/generation/test_common.py
View file @
0da93439
...
@@ -220,7 +220,10 @@ VLM_TEST_SETTINGS = {
...
@@ -220,7 +220,10 @@ VLM_TEST_SETTINGS = {
vllm_runner_kwargs
=
{
vllm_runner_kwargs
=
{
"model_impl"
:
"transformers"
,
"model_impl"
:
"transformers"
,
},
},
marks
=
[
pytest
.
mark
.
core_model
],
marks
=
[
pytest
.
mark
.
core_model
,
*
([
large_gpu_mark
(
min_gb
=
80
)]
if
current_platform
.
is_rocm
()
else
[]),
],
),
),
"idefics3-transformers"
:
VLMTestInfo
(
"idefics3-transformers"
:
VLMTestInfo
(
models
=
[
"HuggingFaceTB/SmolVLM-256M-Instruct"
],
models
=
[
"HuggingFaceTB/SmolVLM-256M-Instruct"
],
...
...
tests/models/multimodal/generation/test_granite_speech.py
View file @
0da93439
...
@@ -39,7 +39,11 @@ models = [MODEL_NAME]
...
@@ -39,7 +39,11 @@ models = [MODEL_NAME]
def
granite_speech_attention_config
():
def
granite_speech_attention_config
():
"""Return attention config for Granite Speech tests on ROCm."""
"""Return attention config for Granite Speech tests on ROCm."""
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
return
{
"backend"
:
"ROCM_AITER_FA"
}
from
vllm.platforms.rocm
import
on_mi3xx
if
on_mi3xx
():
return
{
"backend"
:
"ROCM_AITER_FA"
}
return
{
"backend"
:
"TRITON_ATTN"
}
return
None
return
None
...
...
tests/models/multimodal/generation/test_keye.py
View file @
0da93439
...
@@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple):
...
@@ -24,12 +24,8 @@ class ModelRequestData(NamedTuple):
sampling_params
:
SamplingParams
|
None
=
None
sampling_params
:
SamplingParams
|
None
=
None
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"question"
,
[
QUESTION
])
@
pytest
.
mark
.
parametrize
(
"question"
,
[
QUESTION
])
def
test_keye_vl
(
def
test_keye_vl
(
image_assets
,
question
:
str
):
image_assets
,
question
:
str
,
):
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
images
=
[
asset
.
pil_image
for
asset
in
image_assets
]
image_urls
=
[
encode_image_url
(
image
)
for
image
in
images
]
image_urls
=
[
encode_image_url
(
image
)
for
image
in
images
]
...
...
tests/models/multimodal/generation/test_nemotron_parse.py
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Iterable
,
Sequence
from
collections.abc
import
Sequence
import
pytest
import
pytest
import
regex
as
re
from
transformers
import
AutoModel
from
transformers
import
AutoModel
from
tests.models.utils
import
check_logprobs_close
from
tests.models.utils
import
check_logprobs_close
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.logprobs
import
Logprob
,
SampleLogprobs
from
vllm.tokenizers
import
TokenizerLike
from
....conftest
import
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
create_new_process_for_each_test
IMAGE
=
ImageAsset
(
"paper-11"
).
pil_image_ext
(
ext
=
"png"
).
convert
(
"RGB"
)
IMAGE
=
ImageAsset
(
"paper-11"
).
pil_image_ext
(
ext
=
"png"
).
convert
(
"RGB"
)
PROMPT
=
"</s><s><predict_bbox><predict_classes><output_markdown>"
PROMPT
=
"</s><s><predict_bbox><predict_classes><output_markdown>"
class
DummyLogprobs
(
dict
[
int
,
Logprob
]):
def
__init__
(
self
,
vocab_ids
:
Iterable
[
int
]):
super
().
__init__
(
dict
.
fromkeys
(
vocab_ids
,
Logprob
(
0.0
)))
def
__repr__
(
self
):
return
"DummyLogprobs()"
def
mask_bbox_tokens
(
output
:
tuple
[
list
[
int
],
str
,
SampleLogprobs
],
tokenizer
:
TokenizerLike
,
)
->
tuple
[
list
[
int
],
str
,
SampleLogprobs
]:
"""
Always pass check_logprobs_close check for bounding box tokens
because it is reasonable for them to differ slightly.
"""
ignore_pattern
=
r
"<[xy]_[\d.]+>"
vocab
=
tokenizer
.
get_vocab
()
output_ids
,
output_str
,
out_logprobs
=
output
masked_logprobs
=
list
[
dict
[
int
,
Logprob
]]()
for
token
,
logprobs
in
zip
(
output_ids
,
out_logprobs
):
if
re
.
match
(
ignore_pattern
,
tokenizer
.
decode
(
token
)):
masked_logprobs
.
append
(
DummyLogprobs
(
vocab
.
values
()))
else
:
masked_logprobs
.
append
(
logprobs
)
return
output_ids
,
output_str
,
masked_logprobs
def
run_test
(
def
run_test
(
hf_runner
:
type
[
HfRunner
],
hf_runner
:
type
[
HfRunner
],
vllm_runner
:
type
[
VllmRunner
],
vllm_runner
:
type
[
VllmRunner
],
...
@@ -44,6 +76,8 @@ def run_test(
...
@@ -44,6 +76,8 @@ def run_test(
for
prompts
,
images
in
inputs
for
prompts
,
images
in
inputs
]
]
tokenizer
=
vllm_model
.
llm
.
get_tokenizer
()
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModel
)
as
hf_model
:
hf_outputs_per_case
=
[
hf_outputs_per_case
=
[
hf_model
.
generate_greedy_logprobs_limit
(
hf_model
.
generate_greedy_logprobs_limit
(
...
@@ -58,18 +92,20 @@ def run_test(
...
@@ -58,18 +92,20 @@ def run_test(
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
vllm_outputs_per_case
):
for
hf_outputs
,
vllm_outputs
in
zip
(
hf_outputs_per_case
,
vllm_outputs_per_case
):
check_logprobs_close
(
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
[
outputs_1_lst
=
vllm_outputs
,
mask_bbox_tokens
(
output
,
tokenizer
)
for
output
in
hf_outputs
],
outputs_1_lst
=
[
mask_bbox_tokens
(
output
,
tokenizer
)
for
output
in
vllm_outputs
],
name_0
=
"hf"
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
name_1
=
"vllm"
,
)
)
@
pytest
.
mark
.
core_model
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"nvidia/NVIDIA-Nemotron-Parse-v1.1"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"nvidia/NVIDIA-Nemotron-Parse-v1.1"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
create_new_process_for_each_test
(
"spawn"
)
def
test_models
(
def
test_models
(
hf_runner
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
num_logprobs
:
int
hf_runner
,
vllm_runner
,
model
:
str
,
dtype
:
str
,
num_logprobs
:
int
)
->
None
:
)
->
None
:
...
@@ -77,10 +113,7 @@ def test_models(
...
@@ -77,10 +113,7 @@ def test_models(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
inputs
=
[
inputs
=
[
(
([
PROMPT
]
*
10
,
[
IMAGE
]
*
10
),
[
PROMPT
]
*
10
,
[
IMAGE
]
*
10
,
),
],
],
model
=
model
,
model
=
model
,
dtype
=
dtype
,
dtype
=
dtype
,
...
...
tests/models/multimodal/generation/vlm_utils/builders.py
View file @
0da93439
...
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
...
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
test_info
.
audio_idx_to_prompt
,
test_info
.
audio_idx_to_prompt
,
test_info
.
prompt_formatter
,
test_info
.
prompt_formatter
,
)
)
resampler
=
AudioResampler
(
resampler
=
AudioResampler
(
target_sr
=
16000
)
target_sr
=
16000
,
method
=
"librosa"
,
)
audios
=
[
asset
.
audio_and_sample_rate
for
asset
in
audio_assets
]
audios
=
[
asset
.
audio_and_sample_rate
for
asset
in
audio_assets
]
resampled_audios
=
[
resampled_audios
=
[
(
(
...
...
tests/models/multimodal/generation/vlm_utils/model_utils.py
View file @
0da93439
...
@@ -24,6 +24,7 @@ from transformers import (
...
@@ -24,6 +24,7 @@ from transformers import (
GenerationConfig
,
GenerationConfig
,
GenerationMixin
,
GenerationMixin
,
)
)
from
transformers.masking_utils
import
create_causal_mask
from
transformers.video_utils
import
VideoMetadata
from
transformers.video_utils
import
VideoMetadata
from
vllm.logprobs
import
SampleLogprobs
from
vllm.logprobs
import
SampleLogprobs
...
@@ -489,13 +490,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -489,13 +490,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
image_size
=
self
.
vision_config
.
image_size
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
from
vllm.model_executor.models.h2ovl
import
(
from
vllm.transformers_utils.processors.h2ovl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values_h2ovl
,
image_to_pixel_values_h2ovl
,
)
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
pixel_values
=
[
image_to_pixel_values_h2ovl
(
image_to_pixel_values_h2ovl
(
...
@@ -679,10 +681,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -679,10 +681,14 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
sin
=
sin
.
to
(
inputs_embeds
.
dtype
)
sin
=
sin
.
to
(
inputs_embeds
.
dtype
)
# Prepare attention mask
# Prepare attention mask
if
attention_mask
is
not
None
:
attention_mask
=
create_causal_mask
(
attention_mask
=
self
.
_update_causal_mask
(
config
=
self
.
config
,
attention_mask
,
inputs_embeds
,
cache_position
,
past_key_values
,
False
input_embeds
=
inputs_embeds
,
)
attention_mask
=
attention_mask
,
past_key_values
=
past_key_values
,
position_ids
=
position_ids
,
cache_position
=
cache_position
,
)
# Initialize and collect hidden states
# Initialize and collect hidden states
hidden_states
=
inputs_embeds
hidden_states
=
inputs_embeds
...
@@ -751,16 +757,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -751,16 +757,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self
.
image_size
=
self
.
vision_config
.
image_size
self
.
image_size
=
self
.
vision_config
.
image_size
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
def
__call__
(
self
,
text
:
str
,
images
:
Image
|
list
[
Image
],
**
kwargs
):
from
vllm.model_executor.models.skyworkr1v
import
(
from
vllm.transformers_utils.processors.internvl
import
(
IMG_CONTEXT
,
image_to_pixel_values_internvl
,
IMG_END
,
IMG_START
,
image_to_pixel_values_skyworkr1v
,
)
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
pixel_values
=
[
pixel_values
=
[
image_to_pixel_values_
skyworkr1v
(
image_to_pixel_values_
internvl
(
image
,
image
,
input_size
=
self
.
image_size
,
input_size
=
self
.
image_size
,
min_num
=
self
.
min_num
,
min_num
=
self
.
min_num
,
...
@@ -815,14 +822,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
...
@@ -815,14 +822,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos
:
npt
.
NDArray
|
list
[
npt
.
NDArray
]
=
None
,
videos
:
npt
.
NDArray
|
list
[
npt
.
NDArray
]
=
None
,
**
kwargs
,
**
kwargs
,
):
):
from
vllm.model_executor.models.internvl
import
(
from
vllm.transformers_utils.processors.internvl
import
(
IMG_CONTEXT
,
IMG_END
,
IMG_START
,
image_to_pixel_values_internvl
,
image_to_pixel_values_internvl
,
video_to_pixel_values_internvl
,
video_to_pixel_values_internvl
,
)
)
IMG_START
=
"<img>"
IMG_END
=
"</img>"
IMG_CONTEXT
=
"<IMG_CONTEXT>"
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
images
=
[
images
]
if
isinstance
(
images
,
Image
)
else
images
videos
=
[
videos
]
if
isinstance
(
videos
,
np
.
ndarray
)
else
videos
videos
=
[
videos
]
if
isinstance
(
videos
,
np
.
ndarray
)
else
videos
if
images
is
not
None
:
if
images
is
not
None
:
...
@@ -1260,9 +1268,9 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
...
@@ -1260,9 +1268,9 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
generated).
generated).
"""
"""
import
base64
import
io
import
io
import
pybase64
as
base64
import
soundfile
as
sf
import
soundfile
as
sf
processor
=
hf_model
.
processor
processor
=
hf_model
.
processor
...
...
tests/models/multimodal/pooling/test_colpali.py
View file @
0da93439
...
@@ -7,9 +7,9 @@ ColPali is a multi-vector retrieval model based on PaliGemma backbone
...
@@ -7,9 +7,9 @@ ColPali is a multi-vector retrieval model based on PaliGemma backbone
It produces per-token embeddings for both text and image inputs.
It produces per-token embeddings for both text and image inputs.
"""
"""
import
base64
from
io
import
BytesIO
from
io
import
BytesIO
import
pybase64
as
base64
import
pytest
import
pytest
import
torch
import
torch
from
PIL
import
Image
from
PIL
import
Image
...
...
tests/models/multimodal/pooling/test_colqwen3.py
View file @
0da93439
...
@@ -7,9 +7,9 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
...
@@ -7,9 +7,9 @@ ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
embeddings for both text and image inputs.
"""
"""
import
base64
from
io
import
BytesIO
from
io
import
BytesIO
import
pybase64
as
base64
import
pytest
import
pytest
import
torch
import
torch
from
PIL
import
Image
from
PIL
import
Image
...
...
tests/models/multimodal/pooling/test_colqwen3_5.py
0 → 100644
View file @
0da93439
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ColQwen3.5 late interaction model for multi-modal retrieval.
ColQwen3.5 is a multi-vector retrieval model based on Qwen3.5 backbone with
ColBERT-style late interaction scoring (MaxSim). It produces per-token
embeddings for both text and image inputs.
"""
import
pytest
import
torch
from
....conftest
import
VllmRunner
# Hugging Face model identifiers exercised by the tests in this module.
MODELS = ["athrael-soju/colqwen3.5-4.5B-v3"]

# Expected size of the last embedding dimension, keyed by model identifier.
EMBED_DIMS = {"athrael-soju/colqwen3.5-4.5B-v3": 320}

# Text-only query prompts.
TEXT_QUERIES = [
    "What is the capital of France?",
    "Describe the contents of the document.",
]

# Text-only document prompts.
TEXT_DOCUMENTS = [
    "The capital of France is Paris.",
    "This document contains important financial data.",
]

# dtype string that the tests are parametrized with.
DTYPE = "half"
def _run_token_embed_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check the shape and L2 norm of per-token embeddings.

    Embeds the first entry of ``TEXT_QUERIES`` through the pooling runner
    and asserts that the result is a single 2-D matrix whose second
    dimension equals ``EMBED_DIMS[model]`` and whose rows have norm ~1.

    Args:
        vllm_runner: Context-manager factory that builds the vLLM model.
        model: Model identifier; also the key looked up in ``EMBED_DIMS``.
        dtype: dtype string forwarded to the runner.
    """
    engine_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        outputs = vllm_model.token_embed([TEXT_QUERIES[0]])

        # One prompt in, one embedding matrix out.
        assert len(outputs) == 1
        emb = torch.tensor(outputs[0])

        # Token embeddings should be 2D: [num_tokens, embed_dim]
        assert emb.dim() == 2
        num_tokens, embed_dim = emb.shape
        assert embed_dim == EMBED_DIMS[model]
        assert num_tokens > 1

        # Every token vector is expected to have unit L2 norm
        # (within 1e-2 relative/absolute tolerance).
        norms = torch.norm(emb, p=2, dim=-1)
        expected_norms = torch.ones_like(norms)
        torch.testing.assert_close(norms, expected_norms, rtol=1e-2, atol=1e-2)
def _run_late_interaction_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that ``score()`` agrees with a manually computed MaxSim score.

    Token embeddings for the first query and the first document are fed to
    ``compute_maxsim_score``; the result must match the single value
    returned by ``vllm_model.score`` within 1% relative tolerance.

    Args:
        vllm_runner: Context-manager factory that builds the vLLM model.
        model: Model identifier passed to the runner.
        dtype: dtype string forwarded to the runner.
    """
    from vllm.entrypoints.pooling.score.utils import compute_maxsim_score

    query_text = TEXT_QUERIES[0]
    document_text = TEXT_DOCUMENTS[0]

    engine_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        # Embed the query first, then the document (same order as scoring).
        q_emb = torch.tensor(vllm_model.token_embed([query_text])[0])
        d_emb = torch.tensor(vllm_model.token_embed([document_text])[0])

        # Reference value computed directly from the token embeddings.
        manual_score = compute_maxsim_score(q_emb, d_emb).item()

        vllm_scores = vllm_model.score(query_text, document_text)

        assert len(vllm_scores) == 1
        assert vllm_scores[0] == pytest.approx(manual_score, rel=0.01)
def _run_relevance_test(
    vllm_runner: type[VllmRunner],
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that on-topic documents outscore an off-topic one.

    Scores one machine-learning query against three documents (two on
    topic, one about the weather) and asserts that both on-topic
    documents score strictly higher than the weather document.

    Args:
        vllm_runner: Context-manager factory that builds the vLLM model.
        model: Model identifier passed to the runner.
        dtype: dtype string forwarded to the runner.
    """
    query = "What is machine learning?"
    # Order matters: index 1 is the off-topic document.
    documents = [
        "Machine learning is a subset of artificial intelligence.",
        "The weather forecast shows rain tomorrow.",
        "Deep learning uses neural networks for complex tasks.",
    ]

    engine_kwargs = dict(
        runner="pooling",
        dtype=dtype,
        max_model_len=4096,
        enforce_eager=True,
    )
    with vllm_runner(model, **engine_kwargs) as vllm_model:
        scores = vllm_model.score(query, documents)

        assert len(scores) == 3
        ml_score, weather_score, dl_score = scores
        assert ml_score > weather_score, "ML doc should score higher than weather doc"
        assert dl_score > weather_score, "DL doc should score higher than weather doc"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_token_embed(vllm_runner, model: str, dtype: str) -> None:
    """Run the token-embedding check for each (model, dtype) combination."""
    _run_token_embed_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_late_interaction_scoring(
    vllm_runner, model: str, dtype: str
) -> None:
    """Run the late-interaction scoring check for each (model, dtype) combination."""
    _run_late_interaction_test(vllm_runner, model, dtype=dtype)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [DTYPE])
def test_colqwen3_5_relevance_ordering(vllm_runner, model: str, dtype: str) -> None:
    """Run the relevance-ordering check for each (model, dtype) combination."""
    _run_relevance_test(vllm_runner, model, dtype=dtype)
tests/models/multimodal/pooling/test_llama_nemotron_vl.py
View file @
0da93439
...
@@ -9,10 +9,10 @@ Tests for the LlamaNemotronVL model family:
...
@@ -9,10 +9,10 @@ Tests for the LlamaNemotronVL model family:
Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
Both variants share a SigLIP vision encoder with a bidirectional LLaMA backbone.
"""
"""
import
base64
from
io
import
BytesIO
from
io
import
BytesIO
from
pathlib
import
Path
from
pathlib
import
Path
import
pybase64
as
base64
import
pytest
import
pytest
import
torch
import
torch
from
transformers
import
AutoModel
,
AutoModelForSequenceClassification
,
AutoProcessor
from
transformers
import
AutoModel
,
AutoModelForSequenceClassification
,
AutoProcessor
...
@@ -22,8 +22,10 @@ from vllm.entrypoints.chat_utils import (
...
@@ -22,8 +22,10 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartTextParam
,
ChatCompletionContentPartTextParam
,
)
)
from
vllm.entrypoints.pooling.score.utils
import
ScoreMultiModalParam
from
vllm.entrypoints.pooling.score.utils
import
ScoreMultiModalParam
from
vllm.platforms
import
current_platform
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....utils
import
ROCM_ENGINE_KWARGS
from
...utils
import
check_embeddings_close
from
...utils
import
check_embeddings_close
# Prefixes used by the model API
# Prefixes used by the model API
...
@@ -70,6 +72,7 @@ def _run_test(
...
@@ -70,6 +72,7 @@ def _run_test(
max_model_len
=
2048
,
max_model_len
=
2048
,
enforce_eager
=
True
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
**
ROCM_ENGINE_KWARGS
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
)
vllm_outputs
=
vllm_model
.
embed
(
input_texts
,
images
=
input_images
)
...
@@ -250,6 +253,7 @@ def _run_vllm_reranker(
...
@@ -250,6 +253,7 @@ def _run_vllm_reranker(
max_model_len
=
2048
,
max_model_len
=
2048
,
enforce_eager
=
True
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
**
ROCM_ENGINE_KWARGS
,
)
as
vllm_model
:
)
as
vllm_model
:
has_images
=
any
(
img
is
not
None
for
_
,
img
in
docs
)
has_images
=
any
(
img
is
not
None
for
_
,
img
in
docs
)
...
@@ -322,8 +326,11 @@ def _run_reranker_test(
...
@@ -322,8 +326,11 @@ def _run_reranker_test(
assert
len
(
hf_scores
)
==
len
(
vllm_scores
),
(
assert
len
(
hf_scores
)
==
len
(
vllm_scores
),
(
f
"Output length mismatch: HF=
{
len
(
hf_scores
)
}
, vLLM=
{
len
(
vllm_scores
)
}
"
f
"Output length mismatch: HF=
{
len
(
hf_scores
)
}
, vLLM=
{
len
(
vllm_scores
)
}
"
)
)
# NOTE: ROCm shows slightly higher numerical variance due to the different attention
# backends used by vLLM and HF; use a marginally looser tolerance
rel_tol
=
0.022
if
current_platform
.
is_rocm
()
else
0.02
for
i
,
(
hf_score
,
vllm_score
)
in
enumerate
(
zip
(
hf_scores
,
vllm_scores
)):
for
i
,
(
hf_score
,
vllm_score
)
in
enumerate
(
zip
(
hf_scores
,
vllm_scores
)):
assert
hf_score
==
pytest
.
approx
(
vllm_score
,
rel
=
0.02
),
(
assert
hf_score
==
pytest
.
approx
(
vllm_score
,
rel
=
rel_tol
),
(
f
"Score mismatch at index
{
i
}
: HF=
{
hf_score
:.
4
f
}
, vLLM=
{
vllm_score
:.
4
f
}
"
f
"Score mismatch at index
{
i
}
: HF=
{
hf_score
:.
4
f
}
, vLLM=
{
vllm_score
:.
4
f
}
"
)
)
...
...
tests/models/multimodal/pooling/test_phi3v.py
View file @
0da93439
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
pytest
import
pytest
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
import
transformers.utils
from
PIL
import
Image
from
PIL
import
Image
from
vllm.assets.base
import
get_vllm_public_assets
from
vllm.assets.base
import
get_vllm_public_assets
...
@@ -12,6 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
...
@@ -12,6 +13,12 @@ from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from
....utils
import
large_gpu_test
from
....utils
import
large_gpu_test
from
...utils
import
check_embeddings_close
from
...utils
import
check_embeddings_close
# BC for method that was deleted in Transformers v5.
# Only needed for generating the HF reference.
transformers
.
utils
.
is_flash_attn_greater_or_equal_2_10
=
(
lambda
:
transformers
.
utils
.
is_flash_attn_greater_or_equal
(
"2.1.0"
)
)
HF_TEXT_PROMPTS
=
[
HF_TEXT_PROMPTS
=
[
# T -> X
# T -> X
"Find me an everyday image that matches the given caption: The label of the object is stop sign"
,
# noqa: E501
"Find me an everyday image that matches the given caption: The label of the object is stop sign"
,
# noqa: E501
...
...
tests/models/multimodal/processing/test_h2ovl.py
View file @
0da93439
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
...
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
min_num
:
int
,
min_num
:
int
,
max_num
:
int
,
max_num
:
int
,
):
):
from
vllm.
model_executor.model
s.h2ovl
import
(
from
vllm.
transformers_utils.processor
s.h2ovl
import
(
calculate_h2ovl_targets
,
calculate_h2ovl_targets
,
get_h2ovl_target_ratios
,
get_h2ovl_target_ratios
,
)
)
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
31
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment