Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
713 additions
and
271 deletions
+713
-271
tests/lora/test_punica_ops.py
tests/lora/test_punica_ops.py
+3
-3
tests/lora/test_qwenvl.py
tests/lora/test_qwenvl.py
+127
-4
tests/lora/test_utils.py
tests/lora/test_utils.py
+5
-2
tests/model_executor/model_loader/runai_streamer_loader/conftest.py
...l_executor/model_loader/runai_streamer_loader/conftest.py
+1
-5
tests/model_executor/model_loader/tensorizer_loader/conftest.py
...model_executor/model_loader/tensorizer_loader/conftest.py
+1
-1
tests/model_executor/test_eagle_quantization.py
tests/model_executor/test_eagle_quantization.py
+1
-1
tests/model_executor/test_model_load_with_params.py
tests/model_executor/test_model_load_with_params.py
+8
-10
tests/models/fixtures/qwen2_5_math_prm_reward_step.json
tests/models/fixtures/qwen2_5_math_prm_reward_step.json
+1
-0
tests/models/language/generation/conftest.py
tests/models/language/generation/conftest.py
+28
-0
tests/models/language/generation/test_common.py
tests/models/language/generation/test_common.py
+16
-7
tests/models/language/generation/test_grok.py
tests/models/language/generation/test_grok.py
+43
-0
tests/models/language/generation/test_phimoe.py
tests/models/language/generation/test_phimoe.py
+13
-0
tests/models/language/pooling/conftest.py
tests/models/language/pooling/conftest.py
+29
-0
tests/models/language/pooling/test_embedding.py
tests/models/language/pooling/test_embedding.py
+1
-1
tests/models/language/pooling/test_mm_classifier_conversion.py
.../models/language/pooling/test_mm_classifier_conversion.py
+1
-1
tests/models/language/pooling/test_pooler_config_init_behaviour.py
...els/language/pooling/test_pooler_config_init_behaviour.py
+4
-4
tests/models/language/pooling/test_reward.py
tests/models/language/pooling/test_reward.py
+63
-4
tests/models/language/pooling/test_token_classification.py
tests/models/language/pooling/test_token_classification.py
+29
-10
tests/models/language/pooling_mteb_test/mteb_embed_utils.py
tests/models/language/pooling_mteb_test/mteb_embed_utils.py
+34
-218
tests/models/language/pooling_mteb_test/mteb_score_utils.py
tests/models/language/pooling_mteb_test/mteb_score_utils.py
+305
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/lora/test_punica_ops.py
View file @
7e63ef82
...
...
@@ -9,7 +9,7 @@ import vllm.lora.ops.torch_ops as torch_ops
import
vllm.lora.ops.triton_ops
as
triton_ops
from
vllm.lora.ops.triton_ops
import
LoRAKernelMeta
from
vllm.lora.ops.triton_ops.utils
import
_LORA_A_PTR_DICT
,
_LORA_B_PTR_DICT
from
vllm.
platforms
import
current_platform
from
vllm.
utils.torch_utils
import
set_random_seed
from
.utils
import
PunicaTensors
,
assert_close
,
generate_data_for_nslices
...
...
@@ -395,7 +395,7 @@ def test_kernels(
Tests LoRA kernels.
"""
torch
.
set_default_device
(
device
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
if
op_type
==
"shrink"
:
check_lora_shrink_kernel
(
...
...
@@ -447,7 +447,7 @@ def test_kernels_hidden_size(
Tests SGMV and LoRA kernels.
"""
torch
.
set_default_device
(
device
)
current_platform
.
seed_everything
(
seed
)
set_random_seed
(
seed
)
if
op_type
==
"shrink"
:
check_lora_shrink_kernel
(
...
...
tests/lora/test_qwen
2
vl.py
→
tests/lora/test_qwenvl.py
View file @
7e63ef82
...
...
@@ -2,10 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
import
os
import
vllm
from
vllm.assets.image
import
ImageAsset
from
vllm.lora.request
import
LoRARequest
from
vllm.sampling_params
import
BeamSearchParams
from
..utils
import
models_path_prefix
@
dataclass
...
...
@@ -14,9 +16,12 @@ class TestConfig:
lora_path
:
str
max_num_seqs
:
int
=
2
max_loras
:
int
=
2
max_lora_rank
:
int
=
16
max_model_len
:
int
=
4096
max_lora_rank
:
int
=
32
enable_tower_connector_lora
:
bool
=
False
max_model_len
:
int
=
8192
gpu_memory_utilization
:
float
=
0.85
mm_processor_kwargs
:
dict
[
str
,
int
]
|
None
=
None
mm_processor_cache_gb
:
float
=
4
def
__post_init__
(
self
):
if
self
.
mm_processor_kwargs
is
None
:
...
...
@@ -48,8 +53,11 @@ class Qwen2VLTester:
enable_lora
=
True
,
max_loras
=
self
.
config
.
max_loras
,
max_lora_rank
=
self
.
config
.
max_lora_rank
,
enable_tower_connector_lora
=
self
.
config
.
enable_tower_connector_lora
,
trust_remote_code
=
True
,
gpu_memory_utilization
=
self
.
config
.
gpu_memory_utilization
,
mm_processor_kwargs
=
self
.
config
.
mm_processor_kwargs
,
mm_processor_cache_gb
=
self
.
config
.
mm_processor_cache_gb
,
max_model_len
=
self
.
config
.
max_model_len
,
)
...
...
@@ -58,6 +66,7 @@ class Qwen2VLTester:
images
:
list
[
ImageAsset
],
expected_outputs
:
list
[
str
],
lora_id
:
int
|
None
=
None
,
lora_name
:
str
|
None
=
None
,
temperature
:
float
=
0
,
max_tokens
:
int
=
5
,
):
...
...
@@ -73,10 +82,11 @@ class Qwen2VLTester:
for
asset
in
images
]
lora_request
=
LoRARequest
(
str
(
lora_id
),
lora_id
,
self
.
config
.
lora_path
)
lora_request
=
LoRARequest
(
lora_name
if
lora_name
else
str
(
lora_id
),
lora_id
,
self
.
config
.
lora_path
)
outputs
=
self
.
llm
.
generate
(
inputs
,
sampling_params
,
lora_request
=
lora_request
)
generated_texts
=
[
output
.
outputs
[
0
].
text
.
strip
()
for
output
in
outputs
]
# Validate outputs
for
generated
,
expected
in
zip
(
generated_texts
,
expected_outputs
):
assert
expected
.
startswith
(
generated
),
(
...
...
@@ -127,6 +137,22 @@ EXPECTED_OUTPUTS = [
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky."
,
# noqa: E501
]
EXPECTED_OUTPUTS_LANGUAGE
=
[
"A stop sign is shown in an Asian city, with buildings and a car in the "
"background."
,
"The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees."
,
]
EXPECTED_OUTPUTS_VISION
=
[
"A stop sign in front of oriental buildings."
,
"A tree with pink flowers in front of it and a blue sky behind the flowers."
,
]
EXPECTED_OUTPUTS_VISION_NO_CONNECTOR
=
[
"A stop sign is located on the street of a Chinese neighborhood."
,
"A closeup shot of the Tokyo Skytree with pink flowers in the foreground."
,
]
# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS
=
[
[
...
...
@@ -137,6 +163,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [
QWEN2VL_MODEL_PATH
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
QWEN25VL_MODEL_PATH
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-VL-3B-Instruct"
)
QWEN3VL_MODEL_PATH
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen3-VL-4B-Instruct"
)
def
test_qwen2vl_lora
(
qwen2vl_lora_files
):
...
...
@@ -175,3 +202,99 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
# Test with different LoRA IDs
for
lora_id
in
[
1
,
2
]:
tester
.
run_test
(
TEST_IMAGES
,
expected_outputs
=
EXPECTED_OUTPUTS
,
lora_id
=
lora_id
)
def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    """Run the shared image prompts on Qwen2.5-VL with tower/connector LoRA on.

    Builds a tester with ``enable_tower_connector_lora=True`` and the
    multi-modal processor cache disabled, then checks the generated text
    against ``EXPECTED_OUTPUTS`` for LoRA ids 1 and 2.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN25VL_MODEL_PATH,
            lora_path=qwen25vl_vision_lora_files,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
            enable_tower_connector_lora=True,
        )
    )

    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    """Run the shared image prompts on Qwen3-VL with tower/connector LoRA on.

    Builds a tester with ``enable_tower_connector_lora=True`` and the
    multi-modal processor cache disabled, then checks the generated text
    against ``EXPECTED_OUTPUTS`` for LoRA ids 1 and 2.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN3VL_MODEL_PATH,
            lora_path=qwen3vl_vision_lora_files,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
            enable_tower_connector_lora=True,
        )
    )

    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Exercise three kinds of LoRA adapter against one shared LLM instance.

    The adapters (language-only, vision tower + connector, vision tower only)
    are run back to back on the same tester. Reusing one LLM instance is
    intended to check that the multimodal encoder cache handles the switch
    between language-only and vision-enabled LoRA requests.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN2VL_MODEL_PATH,
            # TestConfig needs a lora_path at construction time; every
            # scenario below overrides it before running.
            lora_path=qwen2vl_language_lora_files,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
            enable_tower_connector_lora=True,
        )
    )

    # (adapter path, LoRA ids, expected outputs, LoRA name), run in order.
    scenarios = (
        # Test 1: Language-only LoRA adapter
        (
            qwen2vl_language_lora_files,
            (1, 2),
            EXPECTED_OUTPUTS_LANGUAGE,
            "language_only",
        ),
        # Test 2: Vision tower + connector LoRA adapter
        (
            qwen2vl_vision_tower_connector_lora_files,
            (3, 4),
            EXPECTED_OUTPUTS_VISION,
            "vision_tower_connector",
        ),
        # Test 3: Vision tower only LoRA adapter (no connector)
        (
            qwen2vl_vision_tower_lora_files,
            (5, 6),
            EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            "vision_tower",
        ),
    )

    for adapter_path, adapter_ids, expected, adapter_name in scenarios:
        tester.config.lora_path = adapter_path
        for adapter_id in adapter_ids:
            tester.run_test(
                TEST_IMAGES,
                expected_outputs=expected,
                lora_id=adapter_id,
                lora_name=adapter_name,
            )
tests/lora/test_utils.py
View file @
7e63ef82
...
...
@@ -3,7 +3,7 @@
from
collections
import
OrderedDict
from
typing
import
NamedTuple
from
unittest.mock
import
patch
from
unittest.mock
import
MagicMock
,
patch
import
pytest
from
huggingface_hub.utils
import
HfHubHTTPError
...
...
@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
# Hugging Face model identifier with download error
path
=
"org/repo"
mock_exist
.
return_value
=
False
mock_snapshot_download
.
side_effect
=
HfHubHTTPError
(
"failed to query model info"
)
mock_snapshot_download
.
side_effect
=
HfHubHTTPError
(
"failed to query model info"
,
response
=
MagicMock
(),
)
assert
get_adapter_absolute_path
(
path
)
==
path
tests/model_executor/model_loader/runai_streamer_loader/conftest.py
View file @
7e63ef82
...
...
@@ -29,11 +29,7 @@ class RunaiDummyExecutor(UniProcExecutor):
is_driver_worker
=
is_driver_worker
,
)
wrapper_kwargs
=
{
"vllm_config"
:
self
.
vllm_config
,
}
self
.
driver_worker
=
WorkerWrapperBase
(
**
wrapper_kwargs
)
self
.
driver_worker
=
WorkerWrapperBase
()
self
.
collective_rpc
(
"init_worker"
,
args
=
([
worker_rpc_kwargs
],))
self
.
collective_rpc
(
"init_device"
)
tests/model_executor/model_loader/tensorizer_loader/conftest.py
View file @
7e63ef82
...
...
@@ -67,7 +67,7 @@ def assert_from_collective_rpc(engine: LLM, closure: Callable, closure_kwargs: d
class
DummyExecutor
(
UniProcExecutor
):
def
_init_executor
(
self
)
->
None
:
"""Initialize the worker and load the model."""
self
.
driver_worker
=
WorkerWrapperBase
(
vllm_config
=
self
.
vllm_config
,
rpc_rank
=
0
)
self
.
driver_worker
=
WorkerWrapperBase
(
rpc_rank
=
0
)
distributed_init_method
=
get_distributed_init_method
(
get_ip
(),
get_open_port
())
local_rank
=
0
# set local rank as the device index if specified
...
...
tests/model_executor/test_eagle_quantization.py
View file @
7e63ef82
...
...
@@ -55,7 +55,7 @@ def test_get_draft_quant_config_without_draft_model():
@
torch
.
inference_mode
()
@
pytest
.
mark
.
parametrize
(
"device"
,
DEVICES
)
def
test_fc_layer_quant_config_usage
(
dist_init
,
device
)
->
None
:
def
test_fc_layer_quant_config_usage
(
default_vllm_config
,
dist_init
,
device
)
->
None
:
import
torch
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
...
...
tests/model_executor/test_model_load_with_params.py
View file @
7e63ef82
...
...
@@ -5,12 +5,8 @@ import os
import
pytest
from
vllm.model_executor.layers.pooler
import
(
CLSPool
,
DispatchPooler
,
MeanPool
,
PoolingType
,
)
from
vllm.model_executor.layers.pooler
import
DispatchPooler
from
vllm.model_executor.layers.pooler.seqwise
import
CLSPool
,
MeanPool
from
vllm.model_executor.models.bert
import
BertEmbeddingModel
from
vllm.model_executor.models.roberta
import
RobertaEmbeddingModel
from
vllm.platforms
import
current_platform
...
...
@@ -51,8 +47,9 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
assert
model_config
.
encoder_config
[
"do_lower_case"
]
# asserts on the pooling config files
assert
model_config
.
pooler_config
.
pooling_type
==
PoolingType
.
CLS
.
name
assert
model_config
.
pooler_config
.
normalize
assert
model_config
.
pooler_config
.
seq_pooling_type
==
"CLS"
assert
model_config
.
pooler_config
.
tok_pooling_type
==
"ALL"
assert
model_config
.
pooler_config
.
use_activation
# asserts on the tokenizer loaded
assert
model_config
.
tokenizer
==
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-base-en-v1.5"
)
...
...
@@ -95,8 +92,9 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
assert
not
model_config
.
encoder_config
[
"do_lower_case"
]
# asserts on the pooling config files
assert
model_config
.
pooler_config
.
pooling_type
==
PoolingType
.
MEAN
.
name
assert
model_config
.
pooler_config
.
normalize
assert
model_config
.
pooler_config
.
seq_pooling_type
==
"MEAN"
assert
model_config
.
pooler_config
.
tok_pooling_type
==
"ALL"
assert
model_config
.
pooler_config
.
use_activation
# asserts on the tokenizer loaded
assert
model_config
.
tokenizer
==
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-base"
)
...
...
tests/models/fixtures/qwen2_5_math_prm_reward_step.json
0 → 100644
View file @
7e63ef82
[[[
0.0006361007690429688
,
0.99951171875
],
[
0.81884765625
,
0.1812744140625
],
[
0.025543212890625
,
0.974609375
],
[
0.0004382133483886719
,
0.99951171875
]]]
\ No newline at end of file
tests/models/language/generation/conftest.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM language generation tests."""
import
warnings
import
torch
from
vllm.platforms
import
current_platform
def pytest_sessionstart(session):
    """Pytest hook: adjust the torch SDP backends on ROCm before tests run.

    Does nothing on non-ROCm platforms. On ROCm it turns off the flash and
    memory-efficient scaled-dot-product kernels, turns on the math kernel,
    and emits a ``UserWarning`` describing the change.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backends = torch.backends.cuda
        sdp_backends.enable_flash_sdp(False)
        sdp_backends.enable_mem_efficient_sdp(False)
        sdp_backends.enable_math_sdp(True)

        notice = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(notice, UserWarning, stacklevel=1)
tests/models/language/generation/test_common.py
View file @
7e63ef82
...
...
@@ -12,6 +12,11 @@ from ...registry import HF_EXAMPLE_MODELS
from
...utils
import
check_logprobs_close
from
....utils
import
models_path_prefix
# Models that require embedding scaling for prompt_embeds test
EMBED_SCALING_MODELS
=
{
"openbmb/MiniCPM4.1-8B"
,
}
# This list contains the model that are using AITER kernel.
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
...
...
@@ -66,8 +71,8 @@ AITER_MODEL_LIST = [
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM
3-4B"
),
marks
=
[
pytest
.
mark
.
core_model
,
large_gpu_mark
(
min_gb
=
32
)],
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM
4.1-8B"
),
# minicpm
marks
=
[
pytest
.
mark
.
core_model
,
large_gpu_mark
(
min_gb
=
48
)],
),
pytest
.
param
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
# opt
...
...
@@ -137,16 +142,20 @@ def test_models(
prompt_embeds
:
list
[
torch
.
Tensor
]
|
None
=
[]
if
use_prompt_embeds
else
None
prompt_token_ids
=
[]
for
prompt
in
example_prompts
:
token_ids
=
hf_model
.
tokenizer
(
prompt
,
return_tensors
=
"pt"
).
input_ids
.
to
(
hf_model
.
model
.
device
)
prompt_token_ids
.
append
(
token_ids
)
if
prompt_embeds
is
not
None
:
prompt_embeds
.
append
(
hf_model
.
model
.
get_input_embeddings
()(
token_ids
).
squeeze
(
0
)
)
embed
=
hf_model
.
model
.
get_input_embeddings
()(
token_ids
)
# MiniCPM models apply scale_emb to embeddings internally.
# vLLM expects pre-scaled embeddings when using inputs_embeds.
if
model
in
EMBED_SCALING_MODELS
:
config
=
hf_model
.
model
.
config
embed
=
embed
*
config
.
scale_emb
prompt_embeds
.
append
(
embed
.
squeeze
(
0
))
with
vllm_runner
(
model
,
...
...
tests/models/language/generation/test_grok.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
...utils
import
dummy_hf_overrides
MODELS
=
[
"xai-org/grok-2"
]
def _grok2_dummy_overrides(hf_config):
    """Return ``hf_config`` rewritten for a dummy-weight Grok run.

    The config is first passed through ``dummy_hf_overrides`` with the
    ``Grok1ForCausalLM`` architecture. The size fields listed below are then
    overwritten on its text config (presumably to keep the dummy model small).
    """
    reduced_dims = {
        "hidden_size": 256,
        "intermediate_size": 512,
        "moe_intermediate_size": 256,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "head_dim": 64,
    }

    overridden = dummy_hf_overrides(hf_config, model_arch="Grok1ForCausalLM")
    overridden.get_text_config().update(reduced_dims)
    return overridden
@pytest.mark.parametrize("model", MODELS)
def test_dummy_generate(vllm_runner, monkeypatch, model: str) -> None:
    """Load the model with dummy weights and greedily generate one token.

    Passes when the returned token ids are longer than the encoded prompt and
    the returned text is not ``None``.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

        runner_kwargs = dict(
            load_format="dummy",
            max_model_len=128,
            hf_overrides=_grok2_dummy_overrides,
            enforce_eager=True,
        )
        with vllm_runner(model, **runner_kwargs) as llm:
            prompt = "Hello from Grok-2"
            prompt_len = len(llm.get_llm().get_tokenizer().encode(prompt))

            generated_ids, generated_text = llm.generate_greedy(
                [prompt], max_tokens=1
            )[0]

            assert len(generated_ids) > prompt_len
            assert generated_text is not None
tests/models/language/generation/test_phimoe.py
View file @
7e63ef82
...
...
@@ -62,6 +62,19 @@ def test_phimoe_routing_function():
assert
torch
.
equal
(
topk_ids
,
ground_truth
[
test_id
][
"topk_ids"
])
# There is a known issue that triggers `AttributeError: 'DynamicCache'
# object has no attribute 'seen_tokens'` when running:
# `tests/models/language/generation/test_phimoe.py::test_models
# [5-64-bfloat16-microsoft/Phi-3.5-MoE-instruct]`
# This issue is being investigated and tracked in:
# https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58
# It is platform-agnostic. Therefore, we skip this test on all platforms for now.
@
pytest
.
mark
.
skip
(
reason
=
"Skipping due to known issue: "
"'DynamicCache' object has no attribute 'seen_tokens'. See: "
"https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58 "
"for details."
,
)
@
pytest
.
mark
.
skipif
(
condition
=
current_platform
.
is_cpu
(),
reason
=
"This test takes a lot time to run on CPU, "
...
...
tests/models/language/pooling/conftest.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM language pooling tests."""
import
warnings
import
torch
from
vllm.platforms
import
current_platform
def pytest_sessionstart(session):
    """Pytest hook: adjust torch numeric settings on ROCm before tests run.

    Does nothing on non-ROCm platforms. On ROCm it turns off the flash and
    memory-efficient scaled-dot-product kernels, turns on the math kernel,
    sets the float32 matmul precision to ``"high"``, and emits a
    ``UserWarning`` describing the SDP change.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backends = torch.backends.cuda
        sdp_backends.enable_flash_sdp(False)
        sdp_backends.enable_mem_efficient_sdp(False)
        sdp_backends.enable_math_sdp(True)
        torch.set_float32_matmul_precision("high")

        notice = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(notice, UserWarning, stacklevel=1)
tests/models/language/pooling/test_embedding.py
View file @
7e63ef82
...
...
@@ -61,7 +61,7 @@ def test_models(
vllm_extra_kwargs
=
{}
if
model
==
(
os
.
path
.
join
(
models_path_prefix
,
"ssmits/Qwen2-7B-Instruct-embed-base"
):
vllm_extra_kwargs
[
"pooler_config"
]
=
PoolerConfig
(
pooling_type
=
"MEAN"
,
normalize
=
False
seq_
pooling_type
=
"MEAN"
,
normalize
=
False
)
max_model_len
:
int
|
None
=
512
...
...
tests/models/language/pooling/test_mm_classifier_conversion.py
View file @
7e63ef82
...
...
@@ -88,7 +88,7 @@ def test_gemma_multimodal(
convert
=
"classify"
,
load_format
=
"auto"
,
hf_overrides
=
update_config
,
pooler_config
=
PoolerConfig
(
pooling_type
=
"LAST"
),
pooler_config
=
PoolerConfig
(
seq_
pooling_type
=
"LAST"
),
max_model_len
=
512
,
enforce_eager
=
True
,
tensor_parallel_size
=
1
,
...
...
tests/models/language/pooling/test_pooler_config_init_behaviour.py
View file @
7e63ef82
...
...
@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
model
,
max_model_len
=
512
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
normalize
=
False
),
pooler_config
=
PoolerConfig
(
use_activation
=
False
),
)
as
vllm_model
:
wo_normalize
=
torch
.
tensor
(
vllm_model
.
embed
(
example_prompts
))
...
...
@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
model
,
max_model_len
=
512
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
normalize
=
True
),
pooler_config
=
PoolerConfig
(
use_activation
=
True
),
)
as
vllm_model
:
w_normalize
=
torch
.
tensor
(
vllm_model
.
embed
(
example_prompts
))
...
...
@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model
,
max_model_len
=
512
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
normalize
=
False
),
pooler_config
=
PoolerConfig
(
use_activation
=
False
),
)
as
vllm_model
:
wo_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
...
...
@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model
,
max_model_len
=
512
,
dtype
=
dtype
,
pooler_config
=
PoolerConfig
(
normalize
=
True
),
pooler_config
=
PoolerConfig
(
use_activation
=
True
),
)
as
vllm_model
:
w_normalize
=
vllm_model
.
token_embed
(
example_prompts
)
...
...
tests/models/language/pooling/test_reward.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
from
typing
import
TYPE_CHECKING
import
pytest
import
torch
...
...
@@ -9,7 +11,18 @@ from transformers import AutoModel
from
vllm.platforms
import
current_platform
from
....conftest
import
HfRunner
from
...utils
import
check_transformers_version
from
....utils
import
VLLM_PATH
from
...registry
import
HF_EXAMPLE_MODELS
if
TYPE_CHECKING
:
from
_typeshed
import
StrPath
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
assert
FIXTURES_PATH
.
exists
()
FIXTURE_REWARD_RESULT
=
{
"Qwen/Qwen2.5-Math-PRM-7B"
:
FIXTURES_PATH
/
"qwen2_5_math_prm_reward_step.json"
,
}
@
pytest
.
fixture
...
...
@@ -60,6 +73,16 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
return
hf_model
def dump_reward_outputs(
    outputs: list[list[list[float]]], filename: "StrPath"
) -> None:
    """Write reward outputs to ``filename`` as UTF-8 encoded JSON.

    Args:
        outputs: Nested reward scores. The checked-in golden fixture is
            nested three levels deep with two floats innermost (presumably
            prompt -> step -> class scores; confirm against the model's
            reward output).
        filename: Destination path. The file is opened in ``"w"`` mode, so
            any existing content is replaced.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(outputs, f)
def load_reward_outputs(filename: "StrPath") -> list[list[list[float]]]:
    """Read reward outputs back from a UTF-8 encoded JSON file.

    Args:
        filename: Path of a JSON file, such as one written by
            ``dump_reward_outputs``.

    Returns:
        The decoded JSON content. For the checked-in golden fixture this is
        a list nested three levels deep with two floats innermost.
    """
    with open(filename, encoding="utf-8") as f:
        return json.load(f)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
...
...
@@ -77,9 +100,8 @@ def test_prm_models(
model
:
str
,
dtype
:
str
,
)
->
None
:
check_transformers_version
(
"Qwen/Qwen2.5-Math-PRM-7B"
,
max_transformers_version
=
"4.53.2"
)
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
if
current_platform
.
is_cpu
():
pytest
.
skip
(
"CPU only supports V1"
)
...
...
@@ -91,9 +113,46 @@ def test_prm_models(
hf_model
=
step_reward_patch_hf_model
(
hf_model
)
hf_outputs
=
hf_model
.
reward
(
math_step_prompts
)
dump_reward_outputs
(
hf_outputs
,
FIXTURE_REWARD_RESULT
[
model
],
)
# check logits difference
for
hf_output
,
vllm_output
in
zip
(
hf_outputs
,
vllm_outputs
):
hf_output
=
torch
.
tensor
(
hf_output
).
float
()
vllm_output
=
torch
.
tensor
(
vllm_output
).
float
()
assert
torch
.
allclose
(
hf_output
,
vllm_output
,
1.5e-2
)
@pytest.mark.parametrize(
    "model",
    [
        pytest.param(
            "Qwen/Qwen2.5-Math-PRM-7B",
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_prm_models_with_golden_outputs(
    vllm_runner,
    math_step_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM reward outputs with the stored golden fixture.

    The test is skipped when ``FIXTURE_REWARD_RESULT`` has no entry for
    ``model``. Otherwise each vLLM output is compared with the matching
    golden output using ``torch.allclose`` with a relative tolerance of
    1.5e-2.
    """
    golden_path = FIXTURE_REWARD_RESULT.get(model)
    if not golden_path:
        pytest.skip(f"No available golden outputs for {model}.")

    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.reward(math_step_prompts)

    golden_outputs = load_reward_outputs(golden_path)

    # check logits difference
    for golden_scores, vllm_scores in zip(golden_outputs, vllm_outputs):
        expected = torch.tensor(golden_scores).float()
        actual = torch.tensor(vllm_scores).float()

        assert torch.allclose(expected, actual, rtol=1.5e-2)
tests/models/language/pooling/test_token_classification.py
View file @
7e63ef82
...
...
@@ -5,6 +5,7 @@ import torch
from
transformers
import
AutoModelForTokenClassification
from
tests.models.utils
import
softmax
from
vllm.platforms
import
current_platform
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"boltuix/NeuroBERT-NER"
])
...
...
@@ -21,8 +22,17 @@ def test_bert_models(
with
vllm_runner
(
model
,
max_model_len
=
None
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_classify
(
example_prompts
)
# Use eager attention on ROCm to avoid HF Transformers flash attention
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
hf_model_kwargs
=
{}
if
current_platform
.
is_rocm
():
hf_model_kwargs
[
"attn_implementation"
]
=
"eager"
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForTokenClassification
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForTokenClassification
,
model_kwargs
=
hf_model_kwargs
,
)
as
hf_model
:
tokenizer
=
hf_model
.
tokenizer
hf_outputs
=
[]
...
...
@@ -34,9 +44,9 @@ def test_bert_models(
# check logits difference
for
hf_output
,
vllm_output
in
zip
(
hf_outputs
,
vllm_outputs
):
hf_output
=
torch
.
tensor
(
hf_output
).
cpu
().
float
()
vllm_output
=
torch
.
tensor
(
vllm_output
).
cpu
().
float
()
assert
torch
.
all
close
(
hf_output
,
vllm_output
,
1e-2
)
hf_output
=
hf_output
.
detach
().
clone
(
).
cpu
().
float
()
vllm_output
=
vllm_output
.
detach
().
clone
(
).
cpu
().
float
()
torch
.
testing
.
assert_
close
(
hf_output
,
vllm_output
,
atol
=
1.2e-2
,
rtol
=
1e-3
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"disham993/electrical-ner-ModernBERT-base"
])
...
...
@@ -52,8 +62,17 @@ def test_modernbert_models(
with
vllm_runner
(
model
,
max_model_len
=
None
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_classify
(
example_prompts
)
# Use eager attention on ROCm to avoid HF Transformers flash attention
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
hf_model_kwargs
=
{}
if
current_platform
.
is_rocm
():
hf_model_kwargs
[
"attn_implementation"
]
=
"eager"
with
hf_runner
(
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForTokenClassification
model
,
dtype
=
dtype
,
auto_cls
=
AutoModelForTokenClassification
,
model_kwargs
=
hf_model_kwargs
,
)
as
hf_model
:
tokenizer
=
hf_model
.
tokenizer
hf_outputs
=
[]
...
...
@@ -65,9 +84,9 @@ def test_modernbert_models(
# check logits difference
for
hf_output
,
vllm_output
in
zip
(
hf_outputs
,
vllm_outputs
):
hf_output
=
torch
.
tensor
(
hf_output
).
cpu
().
float
()
vllm_output
=
torch
.
tensor
(
vllm_output
).
cpu
().
float
()
assert
torch
.
all
close
(
hf_output
,
vllm_output
,
atol
=
1e-2
)
hf_output
=
hf_output
.
detach
().
clone
(
).
cpu
().
float
()
vllm_output
=
vllm_output
.
detach
().
clone
(
).
cpu
().
float
()
torch
.
testing
.
assert_
close
(
hf_output
,
vllm_output
,
atol
=
1
.2
e-2
,
rtol
=
1e-3
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"bd2lcco/Qwen3-0.6B-finetuned"
])
...
...
@@ -96,6 +115,6 @@ def test_auto_conversion(
# check logits difference
for
hf_output
,
vllm_output
in
zip
(
hf_outputs
,
vllm_outputs
):
hf_output
=
torch
.
tensor
(
hf_output
).
cpu
().
float
()
vllm_output
=
torch
.
tensor
(
vllm_output
).
cpu
().
float
()
hf_output
=
hf_output
.
detach
().
clone
(
).
cpu
().
float
()
vllm_output
=
vllm_output
.
detach
().
clone
(
).
cpu
().
float
()
assert
torch
.
allclose
(
hf_output
,
vllm_output
,
atol
=
1e-2
)
tests/models/language/pooling_mteb_test/mteb_utils.py
→
tests/models/language/pooling_mteb_test/mteb_
embed_
utils.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
tempfile
import
mteb
import
numpy
as
np
import
requests
import
torch
from
mteb.models
import
ModelMeta
from
mteb.types
import
Array
...
...
@@ -14,7 +11,6 @@ from torch.utils.data import DataLoader
import
tests.ci_envs
as
ci_envs
from
tests.models.utils
import
(
EmbedModelInfo
,
RerankModelInfo
,
check_embeddings_close
,
get_vllm_extra_kwargs
,
)
...
...
@@ -23,14 +19,10 @@ from tests.models.utils import (
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
#
1
e-4 is a good tolerance threshold
#
5
e-4 is a good tolerance threshold
MTEB_EMBED_TASKS
=
[
"STS12"
]
MTEB_EMBED_TOL
=
1
e-4
MTEB_EMBED_TOL
=
5
e-4
# See #19344
MTEB_RERANK_TASKS
=
[
"NFCorpus"
]
MTEB_RERANK_LANGS
=
[
"eng"
]
MTEB_RERANK_TOL
=
2e-3
_empty_model_meta
=
ModelMeta
(
loader
=
None
,
...
...
@@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta(
)
class
Vllm
MtebE
ncoder
(
mteb
.
EncoderProtocol
):
class
MtebE
mbedMixin
(
mteb
.
EncoderProtocol
):
mteb_model_meta
=
_empty_model_meta
def
__init__
(
self
,
vllm_model
):
self
.
llm
=
vllm_model
self
.
rng
=
np
.
random
.
default_rng
(
seed
=
42
)
def
encode
(
self
,
inputs
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences
=
[
text
for
batch
in
inputs
for
text
in
batch
[
"text"
]]
r
=
self
.
rng
.
permutation
(
len
(
sentences
))
sentences
=
[
sentences
[
i
]
for
i
in
r
]
outputs
=
self
.
llm
.
embed
(
sentences
,
use_tqdm
=
False
)
embeds
=
np
.
array
(
outputs
)
embeds
=
embeds
[
np
.
argsort
(
r
)]
return
embeds
def
similarity
(
self
,
embeddings1
:
np
.
ndarray
,
...
...
@@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
return
sim
class
VllmMtebCrossEncoder
(
mteb
.
CrossEncoderProtocol
):
mteb_model_meta
=
_empty_model_meta
class
VllmMtebEncoder
(
MtebEmbedMixin
):
def
__init__
(
self
,
vllm_model
):
self
.
llm
=
vllm_model
self
.
rng
=
np
.
random
.
default_rng
(
seed
=
42
)
def
predict
(
def
encode
(
self
,
inputs1
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs2
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
queries
=
[
text
for
batch
in
inputs1
for
text
in
batch
[
"text"
]]
corpus
=
[
text
for
batch
in
inputs2
for
text
in
batch
[
"text"
]]
outputs
=
self
.
llm
.
score
(
queries
,
corpus
,
truncate_prompt_tokens
=-
1
,
use_tqdm
=
False
)
scores
=
np
.
array
(
outputs
)
return
scores
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences
=
[
text
for
batch
in
inputs
for
text
in
batch
[
"text"
]]
r
=
self
.
rng
.
permutation
(
len
(
sentences
))
sentences
=
[
sentences
[
i
]
for
i
in
r
]
outputs
=
self
.
llm
.
embed
(
sentences
,
use_tqdm
=
False
)
embeds
=
np
.
array
(
outputs
)
embeds
=
embeds
[
np
.
argsort
(
r
)]
return
embeds
class
OpenAIClientMtebEncoder
(
Vllm
MtebE
ncoder
):
class
OpenAIClientMtebEncoder
(
MtebE
mbedMixin
):
def
__init__
(
self
,
model_name
:
str
,
client
):
self
.
model_name
=
model_name
self
.
client
=
client
...
...
@@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
return
embeds
class
ScoreClientMtebEncoder
(
mteb
.
CrossEncoderProtocol
):
mteb_model_meta
=
_empty_model_meta
def
__init__
(
self
,
model_name
:
str
,
url
):
self
.
model_name
=
model_name
self
.
url
=
url
self
.
rng
=
np
.
random
.
default_rng
(
seed
=
42
)
def
predict
(
self
,
inputs1
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
inputs2
:
DataLoader
[
mteb
.
types
.
BatchedInput
],
*
args
,
**
kwargs
,
)
->
np
.
ndarray
:
queries
=
[
text
for
batch
in
inputs1
for
text
in
batch
[
"text"
]]
full_corpus
=
[
text
for
batch
in
inputs2
for
text
in
batch
[
"text"
]]
outputs
=
[]
for
query
,
corpus
in
zip
(
queries
,
full_corpus
):
outputs
.
append
(
self
.
get_score
(
query
,
corpus
))
scores
=
np
.
array
(
outputs
)
return
scores
def
get_score
(
self
,
query
,
corpus
):
response
=
requests
.
post
(
self
.
url
,
json
=
{
"model"
:
self
.
model_name
,
"text_1"
:
query
,
"text_2"
:
corpus
,
"truncate_prompt_tokens"
:
-
1
,
},
).
json
()
return
response
[
"data"
][
0
][
"score"
]
class
RerankClientMtebEncoder
(
ScoreClientMtebEncoder
):
def
get_score
(
self
,
query
,
corpus
):
response
=
requests
.
post
(
self
.
url
,
json
=
{
"model"
:
self
.
model_name
,
"query"
:
query
,
"documents"
:
[
corpus
],
"truncate_prompt_tokens"
:
-
1
,
},
).
json
()
return
response
[
"results"
][
0
][
"relevance_score"
]
def
run_mteb_embed_task
(
encoder
:
mteb
.
EncoderProtocol
,
tasks
):
tasks
=
mteb
.
get_tasks
(
tasks
=
tasks
)
results
=
mteb
.
evaluate
(
...
...
@@ -243,12 +161,24 @@ def mteb_test_embed_models(
if
model_info
.
architecture
:
assert
model_info
.
architecture
in
model_config
.
architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert
(
model_config
.
_model_info
.
default_pooling_type
==
model_info
.
default_pooling_type
)
# Confirm whether the important configs in model_config are correct.
pooler_config
=
model_config
.
pooler_config
if
model_info
.
seq_pooling_type
is
not
None
:
assert
pooler_config
.
seq_pooling_type
==
model_info
.
seq_pooling_type
if
model_info
.
tok_pooling_type
is
not
None
:
assert
pooler_config
.
tok_pooling_type
==
model_info
.
tok_pooling_type
if
model_info
.
attn_type
is
not
None
:
assert
model_config
.
attn_type
==
model_info
.
attn_type
if
model_info
.
is_prefix_caching_supported
is
not
None
:
assert
(
model_config
.
is_prefix_caching_supported
==
model_info
.
is_prefix_caching_supported
)
if
model_info
.
is_chunked_prefill_supported
is
not
None
:
assert
(
model_config
.
is_chunked_prefill_supported
==
model_info
.
is_chunked_prefill_supported
)
vllm_main_score
=
run_mteb_embed_task
(
VllmMtebEncoder
(
vllm_model
),
MTEB_EMBED_TASKS
...
...
@@ -299,117 +229,3 @@ def mteb_test_embed_models(
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
atol
def
run_mteb_rerank
(
cross_encoder
:
mteb
.
CrossEncoderProtocol
,
tasks
,
languages
):
with
tempfile
.
TemporaryDirectory
()
as
prediction_folder
:
bm25s
=
mteb
.
get_model
(
"bm25s"
)
eval_splits
=
[
"test"
]
mteb_tasks
:
list
[
mteb
.
abstasks
.
AbsTaskRetrieval
]
=
mteb
.
get_tasks
(
tasks
=
tasks
,
languages
=
languages
,
eval_splits
=
eval_splits
)
mteb
.
evaluate
(
bm25s
,
mteb_tasks
,
prediction_folder
=
prediction_folder
,
show_progress_bar
=
False
,
# don't save results for test runs
cache
=
None
,
overwrite_strategy
=
"always"
,
)
second_stage_tasks
=
[]
for
task
in
mteb_tasks
:
second_stage_tasks
.
append
(
task
.
convert_to_reranking
(
prediction_folder
,
top_k
=
10
,
)
)
results
=
mteb
.
evaluate
(
cross_encoder
,
second_stage_tasks
,
show_progress_bar
=
False
,
cache
=
None
,
)
main_score
=
results
[
0
].
scores
[
"test"
][
0
][
"main_score"
]
return
main_score
def
mteb_test_rerank_models_hf
(
hf_runner
,
model_name
,
hf_dtype
=
"float32"
,
hf_model_callback
=
None
):
with
hf_runner
(
model_name
,
is_cross_encoder
=
True
,
dtype
=
hf_dtype
)
as
hf_model
:
if
hf_model_callback
is
not
None
:
hf_model_callback
(
hf_model
)
st_main_score
=
run_mteb_rerank
(
hf_model
,
tasks
=
MTEB_RERANK_TASKS
,
languages
=
MTEB_RERANK_LANGS
)
st_dtype
=
next
(
hf_model
.
model
.
model
.
parameters
()).
dtype
return
st_main_score
,
st_dtype
def
mteb_test_rerank_models
(
hf_runner
,
vllm_runner
,
model_info
:
RerankModelInfo
,
vllm_extra_kwargs
=
None
,
hf_model_callback
=
None
,
vllm_mteb_encoder
=
VllmMtebCrossEncoder
,
atol
=
MTEB_RERANK_TOL
,
):
vllm_extra_kwargs
=
get_vllm_extra_kwargs
(
model_info
,
vllm_extra_kwargs
)
with
vllm_runner
(
model_info
.
name
,
runner
=
"pooling"
,
max_model_len
=
None
,
max_num_seqs
=
8
,
**
vllm_extra_kwargs
,
)
as
vllm_model
:
model_config
=
vllm_model
.
llm
.
llm_engine
.
model_config
# Confirm whether vllm is using the correct architecture
if
model_info
.
architecture
:
assert
model_info
.
architecture
in
model_config
.
architectures
# Score API is only enabled for num_labels == 1
assert
model_config
.
hf_config
.
num_labels
==
1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert
(
model_config
.
_model_info
.
default_pooling_type
==
model_info
.
default_pooling_type
)
vllm_main_score
=
run_mteb_rerank
(
vllm_mteb_encoder
(
vllm_model
),
tasks
=
MTEB_RERANK_TASKS
,
languages
=
MTEB_RERANK_LANGS
,
)
vllm_dtype
=
model_config
.
dtype
head_dtype
=
model_config
.
head_dtype
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if
model_info
.
mteb_score
is
None
:
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
model_info
.
name
,
model_info
.
hf_dtype
,
hf_model_callback
)
else
:
st_main_score
=
model_info
.
mteb_score
st_dtype
=
"Constant"
print
(
"Model:"
,
model_info
.
name
)
print
(
"VLLM:"
,
f
"dtype:
{
vllm_dtype
}
"
,
f
"head_dtype:
{
head_dtype
}
"
,
vllm_main_score
)
print
(
"SentenceTransformers:"
,
st_dtype
,
st_main_score
)
print
(
"Difference:"
,
st_main_score
-
vllm_main_score
)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert
st_main_score
-
vllm_main_score
<
atol
tests/models/language/pooling_mteb_test/mteb_score_utils.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
tempfile
from
pathlib
import
Path
from
typing
import
Any
import
mteb
import
numpy
as
np
import
requests
import
torch
from
mteb.models
import
ModelMeta
from
torch.utils.data
import
DataLoader
from
tests.conftest
import
HfRunner
from
tests.models.utils
import
(
RerankModelInfo
,
get_vllm_extra_kwargs
,
)
# Rerank benchmark selection. See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"]
# Default tolerance used when comparing the reference score to the vLLM score.
MTEB_RERANK_TOL = 2e-3

# Directory holding the score chat templates: five directory levels above
# this file, then "examples/pooling/score/template".
template_home = Path(__file__).parent.parent.parent.parent.parent.joinpath(
    "examples/pooling/score/template"
)
# Placeholder MTEB metadata attached to the cross-encoder wrappers below.
# Only name, revision, framework and modalities carry concrete values.
_empty_model_meta = ModelMeta(
    # Fields given a concrete value.
    name="vllm/model",
    revision="1",
    framework=[],
    modalities=["text"],  # 'image' can be added to evaluate multimodal models
    # Fields left as None.
    loader=None,
    release_date=None,
    languages=None,
    similarity_fn_name=None,
    n_parameters=None,
    memory_usage_mb=None,
    max_tokens=None,
    embed_dim=None,
    license=None,
    open_weights=None,
    public_training_code=None,
    public_training_data=None,
    use_instructions=None,
    training_datasets=None,
)
class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
    """Base class for the MTEB cross-encoder wrappers in this module.

    Its only content is the ``mteb_model_meta`` class attribute, which is
    set to the module-level placeholder metadata.
    """

    # Placeholder metadata object defined at module level.
    mteb_model_meta = _empty_model_meta
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
    """MTEB cross-encoder that scores text pairs via a vLLM model's ``score``."""

    def __init__(self, vllm_model):
        # A fixed seed keeps the shuffling done in ``predict`` reproducible.
        self.rng = np.random.default_rng(seed=42)
        self.llm = vllm_model
        # Picked up from the wrapped model when it has the attribute.
        self.chat_template: str | None = getattr(vllm_model, "chat_template", None)

    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (inputs1, inputs2) text pair and return the scores.

        The pairs are submitted to the model in a shuffled order; the
        returned array is back in the order the pairs were given.
        """
        queries = []
        for batch in inputs1:
            queries.extend(batch["text"])
        corpus = []
        for batch in inputs2:
            corpus.extend(batch["text"])

        # Hoping to discover potential scheduling
        # issues by randomizing the order.
        order = self.rng.permutation(len(queries))
        shuffled_queries = [queries[idx] for idx in order]
        shuffled_corpus = [corpus[idx] for idx in order]

        raw_scores = self.llm.score(
            shuffled_queries,
            shuffled_corpus,
            truncate_prompt_tokens=-1,
            use_tqdm=False,
            chat_template=self.chat_template,
        )

        # argsort of a permutation yields its inverse, which undoes the shuffle.
        restore = np.argsort(order)
        return np.array(raw_scores)[restore]
class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
    """MTEB cross-encoder that scores text pairs through an HTTP endpoint.

    Every (query, document) pair is sent as its own POST request to ``url``.
    Subclasses may override ``get_score`` to use a different request or
    response layout.
    """

    mteb_model_meta = _empty_model_meta

    def __init__(self, model_name: str, url):
        self.model_name = model_name
        self.url = url

    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Return one score per (inputs1, inputs2) text pair, in input order.

        Raises:
            ValueError: if the two inputs yield a different number of texts.
        """
        queries = [text for batch in inputs1 for text in batch["text"]]
        full_corpus = [text for batch in inputs2 for text in batch["text"]]

        # strict=True turns a length mismatch into an error instead of
        # silently dropping the unpaired tail and returning too few scores.
        return np.array(
            [
                self.get_score(query, corpus)
                for query, corpus in zip(queries, full_corpus, strict=True)
            ]
        )

    def get_score(self, query, corpus):
        """POST a single pair to ``self.url`` and return its score.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.post(
            self.url,
            json={
                "model": self.model_name,
                "text_1": query,
                "text_2": corpus,
                "truncate_prompt_tokens": -1,
            },
        )
        # Surface HTTP failures directly rather than as a KeyError on the
        # error payload below.
        response.raise_for_status()
        return response.json()["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of ``ScoreClientMtebEncoder`` using a rerank-style payload.

    The request carries ``query`` and a one-element ``documents`` list, and
    the score is read from the first entry of ``results``.
    """

    def get_score(self, query, corpus):
        """POST a single query/document pair and return its relevance score.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.post(
            self.url,
            json={
                "model": self.model_name,
                "query": query,
                "documents": [corpus],
                "truncate_prompt_tokens": -1,
            },
        )
        # Surface HTTP failures directly rather than as a KeyError on the
        # error payload below.
        response.raise_for_status()
        return response.json()["results"][0]["relevance_score"]
class HFMtebCrossEncoder(MtebCrossEncoderMixin, HfRunner):
    """MTEB cross-encoder backed by ``HfRunner`` in cross-encoder mode.

    With ``chat_template`` set, each pair is rendered to a prompt string and
    passed to ``HfRunner.classify``; otherwise the raw pairs are passed to
    ``HfRunner.predict``.
    """

    # Class-level default; callers may assign a template on the instance.
    chat_template: str | None = None

    def __init__(self, model_name: str, dtype: str = "auto", **kwargs: Any) -> None:
        HfRunner.__init__(
            self,
            model_name=model_name,
            is_cross_encoder=True,
            dtype=dtype,
            **kwargs,
        )

    @torch.no_grad
    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Return the scores for each (inputs1, inputs2) text pair."""
        queries = []
        for batch in inputs1:
            queries.extend(batch["text"])
        corpus = []
        for batch in inputs2:
            corpus.extend(batch["text"])
        pairs = list(zip(queries, corpus))

        # No template: hand the raw (query, document) pairs to HfRunner.
        if self.chat_template is None:
            scored = HfRunner.predict(self, pairs, show_progress_bar=False)
            return scored.cpu().numpy()

        # Template set: render every pair as a two-message conversation
        # with the roles "query" and "document", then classify the prompts.
        tokenizer = self.model.tokenizer
        rendered = [
            tokenizer.apply_chat_template(
                conversation=[
                    {"role": "query", "content": query},
                    {"role": "document", "content": document},
                ],
                tools=None,
                chat_template=self.chat_template,
                tokenize=False,
            )
            for query, document in pairs
        ]
        classified = HfRunner.classify(self, rendered)
        return np.array(classified).squeeze(-1)
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
    """Run a two-stage MTEB rerank evaluation and return its main score.

    Stage one evaluates the "bm25s" model on the requested tasks, writing
    predictions into a temporary folder. Stage two converts each task to a
    reranking task from that folder (``top_k=10``) and evaluates
    ``cross_encoder`` on the converted tasks.

    Returns:
        ``main_score`` of the first "test" entry of the first result.
    """
    with tempfile.TemporaryDirectory() as prediction_folder:
        first_stage_model = mteb.get_model("bm25s")
        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
            tasks=tasks,
            languages=languages,
            eval_splits=["test"],
        )

        mteb.evaluate(
            first_stage_model,
            mteb_tasks,
            prediction_folder=prediction_folder,
            show_progress_bar=False,
            # don't save results for test runs
            cache=None,
            overwrite_strategy="always",
        )

        # Stage two runs while the prediction folder still exists.
        second_stage_tasks = [
            task.convert_to_reranking(prediction_folder, top_k=10)
            for task in mteb_tasks
        ]
        results = mteb.evaluate(
            cross_encoder,
            second_stage_tasks,
            show_progress_bar=False,
            cache=None,
        )
        first_test_entry = results[0].scores["test"][0]
        main_score = first_test_entry["main_score"]
    return main_score
def _read_chat_template(model_info: RerankModelInfo) -> str | None:
    """Return the text of the model's score chat template, or None if unset."""
    if model_info.chat_template_name is None:
        return None
    template_path = template_home / model_info.chat_template_name
    # Decode explicitly so the result does not depend on the locale's
    # default encoding.
    return template_path.read_text(encoding="utf-8")


def _assert_expected_model_config(model_config, model_info: RerankModelInfo) -> None:
    """Assert that vLLM's model config matches what ``model_info`` declares."""
    # Confirm whether vllm is using the correct architecture
    if model_info.architecture:
        assert model_info.architecture in model_config.architectures

    # Score API is only enabled for num_labels == 1
    assert model_config.hf_config.num_labels == 1

    # Confirm whether the important configs in model_config are correct.
    # Each check is skipped when model_info leaves the expectation as None.
    pooler_config = model_config.pooler_config
    if model_info.seq_pooling_type is not None:
        assert pooler_config.seq_pooling_type == model_info.seq_pooling_type
    if model_info.tok_pooling_type is not None:
        assert pooler_config.tok_pooling_type == model_info.tok_pooling_type
    if model_info.attn_type is not None:
        assert model_config.attn_type == model_info.attn_type
    if model_info.is_prefix_caching_supported is not None:
        assert (
            model_config.is_prefix_caching_supported
            == model_info.is_prefix_caching_supported
        )
    if model_info.is_chunked_prefill_supported is not None:
        assert (
            model_config.is_chunked_prefill_supported
            == model_info.is_chunked_prefill_supported
        )


def mteb_test_rerank_models(
    vllm_runner,
    model_info: RerankModelInfo,
    hf_runner=HFMtebCrossEncoder,
    vllm_extra_kwargs=None,
    vllm_mteb_encoder=VllmMtebCrossEncoder,
    atol=MTEB_RERANK_TOL,
):
    """Check that vLLM's MTEB rerank score does not trail the reference.

    Args:
        vllm_runner: context-manager factory yielding the vLLM model.
        model_info: model under test and its expected configuration.
        hf_runner: context-manager factory for the reference model; only
            used when ``model_info.mteb_score`` is None.
        vllm_extra_kwargs: extra keyword arguments for ``vllm_runner``,
            passed through ``get_vllm_extra_kwargs`` first.
        vllm_mteb_encoder: wrapper applied to the vLLM model before it is
            evaluated.
        atol: upper bound on ``reference score - vLLM score``.

    Raises:
        AssertionError: on a configuration mismatch, or when the vLLM score
            falls ``atol`` or more below the reference score.
    """
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    # Maybe load chat_template.
    chat_template = _read_chat_template(model_info)

    with vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=None,
        max_num_seqs=8,
        **vllm_extra_kwargs,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config
        # The encoder wrapper reads the template off the runner object.
        vllm_model.chat_template = chat_template

        _assert_expected_model_config(model_config, model_info)

        vllm_main_score = run_mteb_rerank(
            vllm_mteb_encoder(vllm_model),
            tasks=MTEB_RERANK_TASKS,
            languages=MTEB_RERANK_LANGS,
        )
        vllm_dtype = model_config.dtype
        head_dtype = model_config.head_dtype

    # Accelerate mteb test by setting
    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is None:
        with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model:
            hf_model.chat_template = chat_template
            st_main_score = run_mteb_rerank(
                hf_model,
                tasks=MTEB_RERANK_TASKS,
                languages=MTEB_RERANK_LANGS,
            )
            st_dtype = next(hf_model.model.model.parameters()).dtype
    else:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"

    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol
Prev
1
…
22
23
24
25
26
27
28
29
30
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment