Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -9,7 +9,7 @@ import vllm.lora.ops.torch_ops as torch_ops
import vllm.lora.ops.triton_ops as triton_ops
from vllm.lora.ops.triton_ops import LoRAKernelMeta
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
......@@ -395,7 +395,7 @@ def test_kernels(
Tests LoRA kernels.
"""
torch.set_default_device(device)
current_platform.seed_everything(seed)
set_random_seed(seed)
if op_type == "shrink":
check_lora_shrink_kernel(
......@@ -447,7 +447,7 @@ def test_kernels_hidden_size(
Tests SGMV and LoRA kernels.
"""
torch.set_default_device(device)
current_platform.seed_everything(seed)
set_random_seed(seed)
if op_type == "shrink":
check_lora_shrink_kernel(
......
......@@ -2,10 +2,12 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import os
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.sampling_params import BeamSearchParams
from ..utils import models_path_prefix
@dataclass
......@@ -14,9 +16,12 @@ class TestConfig:
lora_path: str
max_num_seqs: int = 2
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
max_lora_rank: int = 32
enable_tower_connector_lora: bool = False
max_model_len: int = 8192
gpu_memory_utilization: float = 0.85
mm_processor_kwargs: dict[str, int] | None = None
mm_processor_cache_gb: float = 4
def __post_init__(self):
if self.mm_processor_kwargs is None:
......@@ -48,8 +53,11 @@ class Qwen2VLTester:
enable_lora=True,
max_loras=self.config.max_loras,
max_lora_rank=self.config.max_lora_rank,
enable_tower_connector_lora=self.config.enable_tower_connector_lora,
trust_remote_code=True,
gpu_memory_utilization=self.config.gpu_memory_utilization,
mm_processor_kwargs=self.config.mm_processor_kwargs,
mm_processor_cache_gb=self.config.mm_processor_cache_gb,
max_model_len=self.config.max_model_len,
)
......@@ -58,6 +66,7 @@ class Qwen2VLTester:
images: list[ImageAsset],
expected_outputs: list[str],
lora_id: int | None = None,
lora_name: str | None = None,
temperature: float = 0,
max_tokens: int = 5,
):
......@@ -73,10 +82,11 @@ class Qwen2VLTester:
for asset in images
]
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
lora_request = LoRARequest(
lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
)
outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
generated_texts = [output.outputs[0].text.strip() for output in outputs]
# Validate outputs
for generated, expected in zip(generated_texts, expected_outputs):
assert expected.startswith(generated), (
......@@ -127,6 +137,22 @@ EXPECTED_OUTPUTS = [
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
]
# Expected generations for the language-only LoRA adapter; consumed by
# test_qwen2vl_multiple_lora_types. The tester pairs entries positionally
# with the generated texts and only requires each expected string to start
# with the (possibly shorter) generated text.
EXPECTED_OUTPUTS_LANGUAGE = [
"A stop sign is shown in an Asian city, with buildings and a car in the "
"background.",
"The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
]
# Expected generations for the vision tower + connector LoRA adapter.
EXPECTED_OUTPUTS_VISION = [
"A stop sign in front of oriental buildings.",
"A tree with pink flowers in front of it and a blue sky behind the flowers.",
]
# Expected generations for the vision-tower-only LoRA adapter (no connector).
EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
"A stop sign is located on the street of a Chinese neighborhood.",
"A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]
# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS = [
[
......@@ -137,6 +163,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [
QWEN2VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
QWEN25VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
QWEN3VL_MODEL_PATH = os.path.join(models_path_prefix, "Qwen/Qwen3-VL-4B-Instruct")
def test_qwen2vl_lora(qwen2vl_lora_files):
......@@ -175,3 +202,99 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    """Run the Qwen2.5-VL model with tower/connector LoRA enabled.

    The adapter files supplied by the fixture are requested under LoRA ids
    1 and 2, and each run is checked against ``EXPECTED_OUTPUTS``.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN25VL_MODEL_PATH,
            lora_path=qwen25vl_vision_lora_files,
            enable_tower_connector_lora=True,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
        )
    )
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    """Run the Qwen3-VL model with tower/connector LoRA enabled.

    The adapter files supplied by the fixture are requested under LoRA ids
    1 and 2, and each run is checked against ``EXPECTED_OUTPUTS``.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN3VL_MODEL_PATH,
            lora_path=qwen3vl_vision_lora_files,
            enable_tower_connector_lora=True,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
        )
    )
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Exercise three kinds of LoRA adapters (language-only, vision tower +
    connector, vision tower only) against a single shared LLM instance.

    Reusing one engine across the different adapter requests is intended to
    verify that the multimodal encoder cache handles the transitions between
    language-only and vision-enabled adapters correctly.
    """
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN2VL_MODEL_PATH,
            # The adapter path is swapped per scenario below; an initial
            # value is still required to construct the config.
            lora_path=qwen2vl_language_lora_files,
            enable_tower_connector_lora=True,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
        )
    )

    # (adapter files, LoRA ids, expected generations, adapter name)
    scenarios = (
        # Test 1: Language-only LoRA adapter
        (
            qwen2vl_language_lora_files,
            (1, 2),
            EXPECTED_OUTPUTS_LANGUAGE,
            "language_only",
        ),
        # Test 2: Vision tower + connector LoRA adapter
        (
            qwen2vl_vision_tower_connector_lora_files,
            (3, 4),
            EXPECTED_OUTPUTS_VISION,
            "vision_tower_connector",
        ),
        # Test 3: Vision tower only LoRA adapter (no connector)
        (
            qwen2vl_vision_tower_lora_files,
            (5, 6),
            EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            "vision_tower",
        ),
    )

    for adapter_path, adapter_ids, expected, adapter_name in scenarios:
        tester.config.lora_path = adapter_path
        for adapter_id in adapter_ids:
            tester.run_test(
                TEST_IMAGES,
                expected_outputs=expected,
                lora_id=adapter_id,
                lora_name=adapter_name,
            )
......@@ -3,7 +3,7 @@
from collections import OrderedDict
from typing import NamedTuple
from unittest.mock import patch
from unittest.mock import MagicMock, patch
import pytest
from huggingface_hub.utils import HfHubHTTPError
......@@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error(
# Hugging Face model identifier with download error
path = "org/repo"
mock_exist.return_value = False
mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
mock_snapshot_download.side_effect = HfHubHTTPError(
"failed to query model info",
response=MagicMock(),
)
assert get_adapter_absolute_path(path) == path
......@@ -29,11 +29,7 @@ class RunaiDummyExecutor(UniProcExecutor):
is_driver_worker=is_driver_worker,
)
wrapper_kwargs = {
"vllm_config": self.vllm_config,
}
self.driver_worker = WorkerWrapperBase(**wrapper_kwargs)
self.driver_worker = WorkerWrapperBase()
self.collective_rpc("init_worker", args=([worker_rpc_kwargs],))
self.collective_rpc("init_device")
......@@ -67,7 +67,7 @@ def assert_from_collective_rpc(engine: LLM, closure: Callable, closure_kwargs: d
class DummyExecutor(UniProcExecutor):
def _init_executor(self) -> None:
"""Initialize the worker and load the model."""
self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, rpc_rank=0)
self.driver_worker = WorkerWrapperBase(rpc_rank=0)
distributed_init_method = get_distributed_init_method(get_ip(), get_open_port())
local_rank = 0
# set local rank as the device index if specified
......
......@@ -55,7 +55,7 @@ def test_get_draft_quant_config_without_draft_model():
@torch.inference_mode()
@pytest.mark.parametrize("device", DEVICES)
def test_fc_layer_quant_config_usage(dist_init, device) -> None:
def test_fc_layer_quant_config_usage(default_vllm_config, dist_init, device) -> None:
import torch
from vllm.model_executor.layers.linear import ReplicatedLinear
......
......@@ -5,12 +5,8 @@ import os
import pytest
from vllm.model_executor.layers.pooler import (
CLSPool,
DispatchPooler,
MeanPool,
PoolingType,
)
from vllm.model_executor.layers.pooler import DispatchPooler
from vllm.model_executor.layers.pooler.seqwise import CLSPool, MeanPool
from vllm.model_executor.models.bert import BertEmbeddingModel
from vllm.model_executor.models.roberta import RobertaEmbeddingModel
from vllm.platforms import current_platform
......@@ -51,8 +47,9 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
assert model_config.encoder_config["do_lower_case"]
# asserts on the pooling config files
assert model_config.pooler_config.pooling_type == PoolingType.CLS.name
assert model_config.pooler_config.normalize
assert model_config.pooler_config.seq_pooling_type == "CLS"
assert model_config.pooler_config.tok_pooling_type == "ALL"
assert model_config.pooler_config.use_activation
# asserts on the tokenizer loaded
assert model_config.tokenizer == os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5")
......@@ -95,8 +92,9 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
assert not model_config.encoder_config["do_lower_case"]
# asserts on the pooling config files
assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name
assert model_config.pooler_config.normalize
assert model_config.pooler_config.seq_pooling_type == "MEAN"
assert model_config.pooler_config.tok_pooling_type == "ALL"
assert model_config.pooler_config.use_activation
# asserts on the tokenizer loaded
assert model_config.tokenizer == os.path.join(models_path_prefix, "intfloat/multilingual-e5-base")
......
[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]]
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM language generation tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_sessionstart(session):
    """Apply ROCm-only scaled-dot-product-attention settings at session start.

    On any other platform this hook does nothing.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backend = torch.backends.cuda
        sdp_backend.enable_flash_sdp(False)
        sdp_backend.enable_mem_efficient_sdp(False)
        sdp_backend.enable_math_sdp(True)

        notice = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(notice, UserWarning, stacklevel=1)
......@@ -12,6 +12,11 @@ from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
# Models that require embedding scaling for prompt_embeds test
EMBED_SCALING_MODELS = {
"openbmb/MiniCPM4.1-8B",
}
# This list contains the models that are using AITER kernels.
# Skip models that are not using AITER in tests.
# When more AITER kernels are added, this list will not be
......@@ -66,8 +71,8 @@ AITER_MODEL_LIST = [
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=32)],
os.path.join(models_path_prefix, "openbmb/MiniCPM4.1-8B"), # minicpm
marks=[pytest.mark.core_model, large_gpu_mark(min_gb=48)],
),
pytest.param(
os.path.join(models_path_prefix, "facebook/opt-125m"), # opt
......@@ -137,16 +142,20 @@ def test_models(
prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
prompt_token_ids = []
for prompt in example_prompts:
token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids.to(
hf_model.model.device
)
prompt_token_ids.append(token_ids)
if prompt_embeds is not None:
prompt_embeds.append(
hf_model.model.get_input_embeddings()(token_ids).squeeze(0)
)
embed = hf_model.model.get_input_embeddings()(token_ids)
# MiniCPM models apply scale_emb to embeddings internally.
# vLLM expects pre-scaled embeddings when using inputs_embeds.
if model in EMBED_SCALING_MODELS:
config = hf_model.model.config
embed = embed * config.scale_emb
prompt_embeds.append(embed.squeeze(0))
with vllm_runner(
model,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import dummy_hf_overrides
MODELS = ["xai-org/grok-2"]
def _grok2_dummy_overrides(hf_config):
    """Return ``hf_config`` with dummy overrides and reduced dimensions.

    The config is first passed through ``dummy_hf_overrides`` with the
    ``Grok1ForCausalLM`` architecture; its text config is then updated with
    small hidden/intermediate sizes and head counts.
    """
    reduced_dims = {
        "hidden_size": 256,
        "intermediate_size": 512,
        "moe_intermediate_size": 256,
        "num_attention_heads": 4,
        "num_key_value_heads": 2,
        "head_dim": 64,
    }
    overridden = dummy_hf_overrides(hf_config, model_arch="Grok1ForCausalLM")
    overridden.get_text_config().update(reduced_dims)
    return overridden
@pytest.mark.parametrize("model", MODELS)
def test_dummy_generate(vllm_runner, monkeypatch, model: str) -> None:
    """Load the model with dummy weights and greedily generate one token."""
    runner_kwargs = dict(
        load_format="dummy",
        max_model_len=128,
        hf_overrides=_grok2_dummy_overrides,
        enforce_eager=True,
    )
    with monkeypatch.context() as m:
        # hf_overrides is a callable, so insecure serialization is enabled
        # for the duration of the test.
        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
        with vllm_runner(model, **runner_kwargs) as llm:
            prompt = "Hello from Grok-2"
            prompt_len = len(llm.get_llm().get_tokenizer().encode(prompt))

            output_ids, output_str = llm.generate_greedy([prompt], max_tokens=1)[0]

            # The returned ids are expected to be longer than the prompt alone.
            assert len(output_ids) > prompt_len
            assert output_str is not None
......@@ -62,6 +62,19 @@ def test_phimoe_routing_function():
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
# There is a known issue that triggers `AttributeError: 'DynamicCache'
# object has no attribute 'seen_tokens'` when running:
# `tests/models/language/generation/test_phimoe.py::test_models
# [5-64-bfloat16-microsoft/Phi-3.5-MoE-instruct]`
# This issue is being investigated and tracked in:
# https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58
# It is platform-agnostic. Therefore, we skip this test on all platforms for now.
@pytest.mark.skip(
reason="Skipping due to known issue: "
"'DynamicCache' object has no attribute 'seen_tokens'. See: "
"https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58 "
"for details.",
)
@pytest.mark.skipif(
condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM language generation tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_sessionstart(session):
    """Apply ROCm-only attention and matmul settings at session start.

    On any other platform this hook does nothing.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_backend = torch.backends.cuda
        sdp_backend.enable_flash_sdp(False)
        sdp_backend.enable_mem_efficient_sdp(False)
        sdp_backend.enable_math_sdp(True)
        torch.set_float32_matmul_precision("high")

        notice = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(notice, UserWarning, stacklevel=1)
......@@ -61,7 +61,7 @@ def test_models(
vllm_extra_kwargs = {}
if model == (os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"):
vllm_extra_kwargs["pooler_config"] = PoolerConfig(
pooling_type="MEAN", normalize=False
seq_pooling_type="MEAN", normalize=False
)
max_model_len: int | None = 512
......
......@@ -88,7 +88,7 @@ def test_gemma_multimodal(
convert="classify",
load_format="auto",
hf_overrides=update_config,
pooler_config=PoolerConfig(pooling_type="LAST"),
pooler_config=PoolerConfig(seq_pooling_type="LAST"),
max_model_len=512,
enforce_eager=True,
tensor_parallel_size=1,
......
......@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
pooler_config=PoolerConfig(use_activation=False),
) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
......@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
pooler_config=PoolerConfig(use_activation=True),
) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts))
......@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=False),
pooler_config=PoolerConfig(use_activation=False),
) as vllm_model:
wo_normalize = vllm_model.token_embed(example_prompts)
......@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model,
max_model_len=512,
dtype=dtype,
pooler_config=PoolerConfig(normalize=True),
pooler_config=PoolerConfig(use_activation=True),
) as vllm_model:
w_normalize = vllm_model.token_embed(example_prompts)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from typing import TYPE_CHECKING
import pytest
import torch
......@@ -9,7 +11,18 @@ from transformers import AutoModel
from vllm.platforms import current_platform
from ....conftest import HfRunner
from ...utils import check_transformers_version
from ....utils import VLLM_PATH
from ...registry import HF_EXAMPLE_MODELS
if TYPE_CHECKING:
from _typeshed import StrPath
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists()
FIXTURE_REWARD_RESULT = {
"Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH / "qwen2_5_math_prm_reward_step.json",
}
@pytest.fixture
......@@ -60,6 +73,16 @@ def step_reward_patch_hf_model(hf_model: HfRunner):
return hf_model
def dump_reward_outputs(outputs: list[list[float]], filename: "StrPath"):
    """Write ``outputs`` to ``filename`` as UTF-8 JSON, replacing any content."""
    with open(filename, mode="w", encoding="utf-8") as sink:
        sink.write(json.dumps(outputs))
def load_reward_outputs(filename: "StrPath") -> list[list[float]]:
    """Read ``filename`` as UTF-8 and return the JSON value it contains."""
    with open(filename, encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)
@pytest.mark.parametrize(
"model",
[
......@@ -77,9 +100,8 @@ def test_prm_models(
model: str,
dtype: str,
) -> None:
check_transformers_version(
"Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2"
)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_transformers_version(on_fail="skip")
if current_platform.is_cpu():
pytest.skip("CPU only supports V1")
......@@ -91,9 +113,46 @@ def test_prm_models(
hf_model = step_reward_patch_hf_model(hf_model)
hf_outputs = hf_model.reward(math_step_prompts)
dump_reward_outputs(
hf_outputs,
FIXTURE_REWARD_RESULT[model],
)
# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).float()
vllm_output = torch.tensor(vllm_output).float()
assert torch.allclose(hf_output, vllm_output, 1.5e-2)
@pytest.mark.parametrize(
    "model",
    [
        pytest.param(
            "Qwen/Qwen2.5-Math-PRM-7B",
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
def test_prm_models_with_golden_outputs(
    vllm_runner,
    math_step_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM reward outputs against stored golden outputs.

    The test is skipped when ``FIXTURE_REWARD_RESULT`` has no entry for
    ``model``. Otherwise the golden outputs are loaded from the registered
    JSON fixture and compared per prompt with a relative tolerance of 1.5e-2.
    """
    if not FIXTURE_REWARD_RESULT.get(model):
        pytest.skip(f"No available golden outputs for {model}.")

    with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.reward(math_step_prompts)

    golden_outputs = load_reward_outputs(FIXTURE_REWARD_RESULT[model])

    # zip() stops at the shorter sequence, so without this check a missing
    # (or empty) vLLM result would let the comparison loop pass vacuously.
    assert len(golden_outputs) == len(vllm_outputs), (
        f"Expected {len(golden_outputs)} reward outputs, "
        f"got {len(vllm_outputs)}."
    )

    # check logits difference
    for golden_output, vllm_output in zip(golden_outputs, vllm_outputs):
        golden_output = torch.tensor(golden_output).float()
        vllm_output = torch.tensor(vllm_output).float()
        assert torch.allclose(golden_output, vllm_output, rtol=1.5e-2)
......@@ -5,6 +5,7 @@ import torch
from transformers import AutoModelForTokenClassification
from tests.models.utils import softmax
from vllm.platforms import current_platform
@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
......@@ -21,8 +22,17 @@ def test_bert_models(
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts)
# Use eager attention on ROCm to avoid HF Transformers flash attention
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
hf_model_kwargs = {}
if current_platform.is_rocm():
hf_model_kwargs["attn_implementation"] = "eager"
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
model,
dtype=dtype,
auto_cls=AutoModelForTokenClassification,
model_kwargs=hf_model_kwargs,
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
......@@ -34,9 +44,9 @@ def test_bert_models(
# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
assert torch.allclose(hf_output, vllm_output, 1e-2)
hf_output = hf_output.detach().clone().cpu().float()
vllm_output = vllm_output.detach().clone().cpu().float()
torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
......@@ -52,8 +62,17 @@ def test_modernbert_models(
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts)
# Use eager attention on ROCm to avoid HF Transformers flash attention
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
hf_model_kwargs = {}
if current_platform.is_rocm():
hf_model_kwargs["attn_implementation"] = "eager"
with hf_runner(
model, dtype=dtype, auto_cls=AutoModelForTokenClassification
model,
dtype=dtype,
auto_cls=AutoModelForTokenClassification,
model_kwargs=hf_model_kwargs,
) as hf_model:
tokenizer = hf_model.tokenizer
hf_outputs = []
......@@ -65,9 +84,9 @@ def test_modernbert_models(
# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2)
hf_output = hf_output.detach().clone().cpu().float()
vllm_output = vllm_output.detach().clone().cpu().float()
torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3)
@pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"])
......@@ -96,6 +115,6 @@ def test_auto_conversion(
# check logits difference
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = torch.tensor(hf_output).cpu().float()
vllm_output = torch.tensor(vllm_output).cpu().float()
hf_output = hf_output.detach().clone().cpu().float()
vllm_output = vllm_output.detach().clone().cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
import mteb
import numpy as np
import requests
import torch
from mteb.models import ModelMeta
from mteb.types import Array
......@@ -14,7 +11,6 @@ from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
check_embeddings_close,
get_vllm_extra_kwargs,
)
......@@ -23,14 +19,10 @@ from tests.models.utils import (
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
# 5e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4
MTEB_EMBED_TOL = 5e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3
_empty_model_meta = ModelMeta(
loader=None,
......@@ -54,29 +46,9 @@ _empty_model_meta = ModelMeta(
)
class VllmMtebEncoder(mteb.EncoderProtocol):
class MtebEmbedMixin(mteb.EncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
def similarity(
self,
embeddings1: np.ndarray,
......@@ -102,31 +74,29 @@ class VllmMtebEncoder(mteb.EncoderProtocol):
return sim
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
class VllmMtebEncoder(MtebEmbedMixin):
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def predict(
def encode(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
return scores
# Hoping to discover potential scheduling
# issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
class OpenAIClientMtebEncoder(VllmMtebEncoder):
class OpenAIClientMtebEncoder(MtebEmbedMixin):
def __init__(self, model_name: str, client):
self.model_name = model_name
self.client = client
......@@ -153,58 +123,6 @@ class OpenAIClientMtebEncoder(VllmMtebEncoder):
return embeds
class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
    """MTEB cross-encoder that scores text pairs through an HTTP endpoint."""

    mteb_model_meta = _empty_model_meta

    def __init__(self, model_name: str, url):
        self.rng = np.random.default_rng(seed=42)
        self.url = url
        self.model_name = model_name

    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, document) pair with one request per pair."""
        queries = [text for batch in inputs1 for text in batch["text"]]
        full_corpus = [text for batch in inputs2 for text in batch["text"]]

        pair_scores = [
            self.get_score(query, document)
            for query, document in zip(queries, full_corpus)
        ]
        return np.array(pair_scores)

    def get_score(self, query, corpus):
        """POST one text pair to ``self.url`` and return the reported score."""
        payload = {
            "model": self.model_name,
            "text_1": query,
            "text_2": corpus,
            "truncate_prompt_tokens": -1,
        }
        body = requests.post(self.url, json=payload).json()
        return body["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of the score client that sends a query/documents payload."""

    def get_score(self, query, corpus):
        """POST one query with a single document and return its relevance."""
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": [corpus],
            "truncate_prompt_tokens": -1,
        }
        body = requests.post(self.url, json=payload).json()
        return body["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks)
results = mteb.evaluate(
......@@ -243,12 +161,24 @@ def mteb_test_embed_models(
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (
model_config._model_info.default_pooling_type
== model_info.default_pooling_type
)
# Confirm whether the important configs in model_config are correct.
pooler_config = model_config.pooler_config
if model_info.seq_pooling_type is not None:
assert pooler_config.seq_pooling_type == model_info.seq_pooling_type
if model_info.tok_pooling_type is not None:
assert pooler_config.tok_pooling_type == model_info.tok_pooling_type
if model_info.attn_type is not None:
assert model_config.attn_type == model_info.attn_type
if model_info.is_prefix_caching_supported is not None:
assert (
model_config.is_prefix_caching_supported
== model_info.is_prefix_caching_supported
)
if model_info.is_chunked_prefill_supported is not None:
assert (
model_config.is_chunked_prefill_supported
== model_info.is_chunked_prefill_supported
)
vllm_main_score = run_mteb_embed_task(
VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
......@@ -299,117 +229,3 @@ def mteb_test_embed_models(
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
    """Run a two-stage MTEB reranking evaluation and return the main score.

    Stage one evaluates the ``bm25s`` model on the requested tasks and writes
    its predictions into a temporary folder; stage two converts each task to
    a reranking task (top 10) and evaluates ``cross_encoder`` on it.
    """
    with tempfile.TemporaryDirectory() as prediction_folder:
        first_stage_model = mteb.get_model("bm25s")

        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
            tasks=tasks, languages=languages, eval_splits=["test"]
        )

        mteb.evaluate(
            first_stage_model,
            mteb_tasks,
            prediction_folder=prediction_folder,
            show_progress_bar=False,
            # don't save results for test runs
            cache=None,
            overwrite_strategy="always",
        )

        second_stage_tasks = [
            task.convert_to_reranking(prediction_folder, top_k=10)
            for task in mteb_tasks
        ]

        results = mteb.evaluate(
            cross_encoder,
            second_stage_tasks,
            show_progress_bar=False,
            cache=None,
        )
        return results[0].scores["test"][0]["main_score"]
def mteb_test_rerank_models_hf(
    hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
):
    """Run the reranking benchmark on the HF cross-encoder reference.

    Returns a ``(main_score, dtype)`` tuple, where the dtype is that of the
    first parameter of the wrapped model. ``hf_model_callback``, when given,
    is called with the runner before evaluation.
    """
    with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        reference_score = run_mteb_rerank(
            hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS
        )
        first_param = next(hf_model.model.model.parameters())
        reference_dtype = first_param.dtype
    return reference_score, reference_dtype
def mteb_test_rerank_models(
    hf_runner,
    vllm_runner,
    model_info: RerankModelInfo,
    vllm_extra_kwargs=None,
    hf_model_callback=None,
    vllm_mteb_encoder=VllmMtebCrossEncoder,
    atol=MTEB_RERANK_TOL,
):
    """Check that vLLM's reranking score is not worse than the reference.

    The reference score is ``model_info.mteb_score`` when set; otherwise it
    is computed with the HF runner. The comparison is one-sided: the test
    fails only if the reference beats vLLM by ``atol`` or more.
    """
    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    with vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=None,
        max_num_seqs=8,
        **vllm_extra_kwargs,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # Score API is only enabled for num_labels == 1
        assert model_config.hf_config.num_labels == 1

        # Confirm whether vllm uses the correct default_pooling_type, which
        # relates to whether chunked prefill and prefix caching are enabled
        expected_pooling = model_info.default_pooling_type
        assert model_config._model_info.default_pooling_type == expected_pooling

        vllm_main_score = run_mteb_rerank(
            vllm_mteb_encoder(vllm_model),
            tasks=MTEB_RERANK_TASKS,
            languages=MTEB_RERANK_LANGS,
        )
        vllm_dtype = model_config.dtype
        head_dtype = model_config.head_dtype

    # Accelerate mteb test by setting
    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is not None:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"
    else:
        st_main_score, st_dtype = mteb_test_rerank_models_hf(
            hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback
        )

    score_gap = st_main_score - vllm_main_score

    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", score_gap)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert score_gap < atol
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from pathlib import Path
from typing import Any
import mteb
import numpy as np
import requests
import torch
from mteb.models import ModelMeta
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.utils import (
RerankModelInfo,
get_vllm_extra_kwargs,
)
# See #19344
# Task and language selection passed to `mteb.get_tasks` by the rerank tests.
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["eng"]
# Default `atol`: the reference score may exceed the vLLM score by less
# than this amount before the test fails.
MTEB_RERANK_TOL = 2e-3

# Directory from which score chat templates are read; located by walking
# five `.parent` steps up from this file and descending into the examples.
template_home = (
    Path(__file__).parent.parent.parent.parent.parent
    / "examples/pooling/score/template"
)

# Placeholder metadata attached to the cross-encoder wrappers in this module;
# all descriptive fields are left unset.
_empty_model_meta = ModelMeta(
    loader=None,
    name="vllm/model",
    revision="1",
    release_date=None,
    languages=None,
    framework=[],
    similarity_fn_name=None,
    n_parameters=None,
    memory_usage_mb=None,
    max_tokens=None,
    embed_dim=None,
    license=None,
    open_weights=None,
    public_training_code=None,
    public_training_data=None,
    use_instructions=None,
    training_datasets=None,
    modalities=["text"],  # 'image' can be added to evaluate multimodal models
)
class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
    """Common base for the cross-encoder wrappers defined in this module."""

    # Placeholder model metadata shared by every subclass.
    mteb_model_meta = _empty_model_meta
class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
    """Cross-encoder wrapper that scores pairs via a vLLM model's ``score``."""

    def __init__(self, vllm_model):
        """Wrap ``vllm_model``, reusing its ``chat_template`` if it has one."""
        self.llm = vllm_model
        # Fixed seed keeps the shuffling in `predict` reproducible.
        self.rng = np.random.default_rng(seed=42)
        self.chat_template: str | None = getattr(vllm_model, "chat_template", None)

    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, document) pair and return scores in input order.

        Args:
            inputs1: Batches whose ``"text"`` entries are the queries.
            inputs2: Batches whose ``"text"`` entries are the documents.
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            Array of scores, one per pair, aligned with the input order.
        """
        queries = []
        for batch in inputs1:
            queries.extend(batch["text"])

        corpus = []
        for batch in inputs2:
            corpus.extend(batch["text"])

        # Submit the pairs in a random order in the hope of surfacing
        # scheduling issues.
        order = self.rng.permutation(len(queries))
        shuffled_queries = [queries[idx] for idx in order]
        shuffled_corpus = [corpus[idx] for idx in order]

        raw_scores = self.llm.score(
            shuffled_queries,
            shuffled_corpus,
            truncate_prompt_tokens=-1,
            use_tqdm=False,
            chat_template=self.chat_template,
        )

        # argsort of the permutation is its inverse, which undoes the shuffle.
        restore = np.argsort(order)
        return np.array(raw_scores)[restore]
class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
    """Cross-encoder wrapper that scores pairs through an HTTP score endpoint."""

    mteb_model_meta = _empty_model_meta

    def __init__(self, model_name: str, url):
        """Remember the model name and the endpoint URL used by ``get_score``."""
        self.model_name = model_name
        self.url = url

    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, document) pair with one request per pair.

        Args:
            inputs1: Batches whose ``"text"`` entries are the queries.
            inputs2: Batches whose ``"text"`` entries are the documents.
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            Array of scores in input order.
        """
        queries = [text for batch in inputs1 for text in batch["text"]]
        full_corpus = [text for batch in inputs2 for text in batch["text"]]

        return np.array(
            [self.get_score(query, corpus) for query, corpus in zip(queries, full_corpus)]
        )

    def get_score(self, query, corpus):
        """POST one pair to ``self.url`` and return the first score in the reply.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.post(
            self.url,
            json={
                "model": self.model_name,
                "text_1": query,
                "text_2": corpus,
                "truncate_prompt_tokens": -1,
            },
        )
        # Surface server-side failures directly instead of as an opaque
        # KeyError when the error payload lacks the "data" field.
        response.raise_for_status()
        return response.json()["data"][0]["score"]
class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of the score client that sends rerank-style requests."""

    def get_score(self, query, corpus):
        """POST one query/document pair and return its relevance score.

        The document is sent as a single-element ``documents`` list, so the
        first entry of ``results`` is the only one.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.post(
            self.url,
            json={
                "model": self.model_name,
                "query": query,
                "documents": [corpus],
                "truncate_prompt_tokens": -1,
            },
        )
        # Surface server-side failures directly instead of as an opaque
        # KeyError when the error payload lacks the "results" field.
        response.raise_for_status()
        return response.json()["results"][0]["relevance_score"]
class HFMtebCrossEncoder(MtebCrossEncoderMixin, HfRunner):
    """Cross-encoder wrapper around ``HfRunner`` used as the reference model."""

    # When set, pairs are rendered through this chat template and classified;
    # otherwise they go through `HfRunner.predict` as (query, document) tuples.
    chat_template: str | None = None

    def __init__(self, model_name: str, dtype: str = "auto", **kwargs: Any) -> None:
        """Initialise the underlying ``HfRunner`` in cross-encoder mode."""
        HfRunner.__init__(
            self,
            model_name=model_name,
            is_cross_encoder=True,
            dtype=dtype,
            **kwargs,
        )

    @torch.no_grad
    def predict(
        self,
        inputs1: DataLoader[mteb.types.BatchedInput],
        inputs2: DataLoader[mteb.types.BatchedInput],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, document) pair.

        Args:
            inputs1: Batches whose ``"text"`` entries are the queries.
            inputs2: Batches whose ``"text"`` entries are the documents.
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            Array of scores in input order.
        """
        queries = [text for batch in inputs1 for text in batch["text"]]
        corpus = [text for batch in inputs2 for text in batch["text"]]

        # No template: hand the raw pairs to the runner's own predict.
        if self.chat_template is None:
            pairs = list(zip(queries, corpus))
            outputs_tensor = HfRunner.predict(self, pairs, show_progress_bar=False)
            return outputs_tensor.cpu().numpy()

        # Template path: render each pair into a single prompt string.
        tokenizer = self.model.tokenizer
        prompts = [
            tokenizer.apply_chat_template(
                conversation=[
                    {"role": "query", "content": query},
                    {"role": "document", "content": document},
                ],
                tools=None,
                chat_template=self.chat_template,
                tokenize=False,
            )
            for query, document in zip(queries, corpus)
        ]
        classified = HfRunner.classify(self, prompts)
        # Drop the trailing axis of the classification output.
        return np.array(classified).squeeze(-1)
def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
    """Run a two-stage rerank evaluation and return the main score.

    Stage one evaluates the ``bm25s`` model on the requested tasks and writes
    its predictions to a temporary folder. Stage two converts each task into
    a reranking task over those predictions (``top_k=10``) and evaluates
    ``cross_encoder`` on them.

    Args:
        cross_encoder: Model to evaluate in the second stage.
        tasks: Task names passed to ``mteb.get_tasks``.
        languages: Languages passed to ``mteb.get_tasks``.

    Returns:
        ``main_score`` of the first result's first ``"test"`` split entry.
    """
    with tempfile.TemporaryDirectory() as prediction_folder:
        first_stage_model = mteb.get_model("bm25s")
        mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
            tasks=tasks,
            languages=languages,
            eval_splits=["test"],
        )

        # First stage: only the predictions written to disk are needed, so
        # nothing is cached and existing outputs are always overwritten.
        mteb.evaluate(
            first_stage_model,
            mteb_tasks,
            prediction_folder=prediction_folder,
            show_progress_bar=False,
            cache=None,
            overwrite_strategy="always",
        )

        second_stage_tasks = [
            task.convert_to_reranking(prediction_folder, top_k=10)
            for task in mteb_tasks
        ]

        results = mteb.evaluate(
            cross_encoder,
            second_stage_tasks,
            show_progress_bar=False,
            cache=None,
        )
        first_test_entry = results[0].scores["test"][0]
        main_score = first_test_entry["main_score"]
    return main_score
def mteb_test_rerank_models(
    vllm_runner,
    model_info: RerankModelInfo,
    hf_runner=HFMtebCrossEncoder,
    vllm_extra_kwargs=None,
    vllm_mteb_encoder=VllmMtebCrossEncoder,
    atol=MTEB_RERANK_TOL,
):
    """Check that vLLM's MTEB rerank score is not worse than the reference.

    The model is loaded through ``vllm_runner`` as a pooling model, the
    config expectations carried by ``model_info`` are asserted, and the
    rerank benchmark is run on it. The reference score is either the constant
    ``model_info.mteb_score`` or, when that is ``None``, the score obtained
    by running the same benchmark on a model built with ``hf_runner``.

    Args:
        vllm_runner: Callable returning a context manager that yields the
            vLLM model wrapper.
        model_info: Description of the model under test and its expectations.
        hf_runner: Callable returning a context manager that yields the
            reference cross-encoder.
        vllm_extra_kwargs: Extra engine kwargs, merged by
            ``get_vllm_extra_kwargs``.
        vllm_mteb_encoder: Factory wrapping the vLLM model for mteb.
        atol: Upper bound (exclusive) on ``reference - vllm`` score.
    """
    engine_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)

    # Load the chat template only when the model declares one.
    template_name = model_info.chat_template_name
    chat_template: str | None = (
        None if template_name is None else (template_home / template_name).read_text()
    )

    runner_ctx = vllm_runner(
        model_info.name,
        runner="pooling",
        max_model_len=None,
        max_num_seqs=8,
        **engine_kwargs,
    )
    with runner_ctx as vllm_model:
        cfg = vllm_model.llm.llm_engine.model_config
        vllm_model.chat_template = chat_template

        # The architecture check is skipped when no expectation is given.
        if model_info.architecture:
            assert model_info.architecture in cfg.architectures

        # The Score API requires a single-label classification head.
        assert cfg.hf_config.num_labels == 1

        # Each remaining expectation is optional: None means "don't check".
        pooler_cfg = cfg.pooler_config
        if model_info.seq_pooling_type is not None:
            assert pooler_cfg.seq_pooling_type == model_info.seq_pooling_type
        if model_info.tok_pooling_type is not None:
            assert pooler_cfg.tok_pooling_type == model_info.tok_pooling_type
        if model_info.attn_type is not None:
            assert cfg.attn_type == model_info.attn_type
        if model_info.is_prefix_caching_supported is not None:
            prefix_caching = cfg.is_prefix_caching_supported
            assert prefix_caching == model_info.is_prefix_caching_supported
        if model_info.is_chunked_prefill_supported is not None:
            chunked_prefill = cfg.is_chunked_prefill_supported
            assert chunked_prefill == model_info.is_chunked_prefill_supported

        vllm_main_score = run_mteb_rerank(
            vllm_mteb_encoder(vllm_model),
            tasks=MTEB_RERANK_TASKS,
            languages=MTEB_RERANK_LANGS,
        )
        vllm_dtype, head_dtype = cfg.dtype, cfg.head_dtype

    # A pre-recorded SentenceTransformers score lets the test skip the
    # (slow) reference run entirely.
    if model_info.mteb_score is not None:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"
    else:
        with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model:
            hf_model.chat_template = chat_template
            st_main_score = run_mteb_rerank(
                hf_model,
                tasks=MTEB_RERANK_TASKS,
                languages=MTEB_RERANK_LANGS,
            )
            # Report the dtype of the first parameter of the wrapped model.
            st_dtype = next(hf_model.model.model.parameters()).dtype

    score_gap = st_main_score - vllm_main_score

    print("Model:", model_info.name)
    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", score_gap)

    # One-sided check: vLLM scoring higher than the reference is acceptable.
    assert score_gap < atol
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment