Commit 711aa9d5 authored by zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
...@@ -38,8 +38,8 @@ ERROR_CASES = [ ...@@ -38,8 +38,8 @@ ERROR_CASES = [
] ]
def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): def test_peft_helper_pass(sql_lora_files, tmp_path):
peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
max_position_embeddings=4096) max_position_embeddings=4096)
lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
peft_helper.validate_legal(lora_config) peft_helper.validate_legal(lora_config)
...@@ -56,15 +56,12 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): ...@@ -56,15 +56,12 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
"embed_tokens", "embed_tokens",
"lm_head", "lm_head",
] ]
assert peft_helper.context_length == 16384
assert peft_helper.vllm_max_position_embeddings == 4096 assert peft_helper.vllm_max_position_embeddings == 4096
assert peft_helper.vllm_long_context_scaling_factor == float(
math.ceil(peft_helper.context_length /
peft_helper.vllm_max_position_embeddings))
# test RSLoRA # test RSLoRA
rslora_config = dict(use_rslora=True) rslora_config = dict(use_rslora=True)
test_dir = tmp_path / "test_rslora" test_dir = tmp_path / "test_rslora"
shutil.copytree(long_context_lora_files_16k_1, test_dir) shutil.copytree(sql_lora_files, test_dir)
# Load and modify configuration # Load and modify configuration
config_path = test_dir / "adapter_config.json" config_path = test_dir / "adapter_config.json"
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import pytest
import vllm import vllm
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
...@@ -51,9 +51,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -51,9 +51,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts return generated_texts
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
def test_phi2_lora(phi2_lora_files): def test_phi2_lora(phi2_lora_files):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM. # Otherwise, the lora-test will fail due to CUDA OOM.
......
...@@ -9,7 +9,7 @@ from vllm.platforms import current_platform ...@@ -9,7 +9,7 @@ from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test, multi_gpu_test from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "ArthurZ/ilama-3.2-1B" MODEL_PATH = "hmellor/Ilama-3.2-1B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
import os
import pytest import pytest
import ray import ray
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
import vllm.envs as envs import vllm.envs as envs
from vllm import EngineArgs, LLMEngine from vllm import EngineArgs, LLMEngine
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger from vllm.engine.metrics import RayPrometheusStatLogger
...@@ -48,7 +46,7 @@ def test_metric_counter_prompt_tokens( ...@@ -48,7 +46,7 @@ def test_metric_counter_prompt_tokens(
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model: gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
prompt_token_counts = [ prompt_token_counts = [
len(tokenizer.encode(p)) for p in example_prompts len(tokenizer.encode(p)) for p in example_prompts
] ]
...@@ -60,7 +58,7 @@ def test_metric_counter_prompt_tokens( ...@@ -60,7 +58,7 @@ def test_metric_counter_prompt_tokens(
vllm_prompt_token_count = sum(prompt_token_counts) vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens) _ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_prompt_tokens.labels( metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
...@@ -84,8 +82,8 @@ def test_metric_counter_generation_tokens( ...@@ -84,8 +82,8 @@ def test_metric_counter_generation_tokens(
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model: gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels( metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
vllm_generation_count = 0 vllm_generation_count = 0
...@@ -120,8 +118,8 @@ def test_metric_counter_generation_tokens_multi_step( ...@@ -120,8 +118,8 @@ def test_metric_counter_generation_tokens_multi_step(
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels( metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
vllm_generation_count = 0 vllm_generation_count = 0
...@@ -152,7 +150,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, ...@@ -152,7 +150,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.3, gpu_memory_utilization=0.3,
served_model_name=served_model_name) as vllm_model: served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metrics_tag_content = stat_logger.labels["model_name"] metrics_tag_content = stat_logger.labels["model_name"]
if envs.VLLM_CI_USE_S3: if envs.VLLM_CI_USE_S3:
...@@ -236,149 +234,6 @@ def test_engine_log_metrics_regression( ...@@ -236,149 +234,6 @@ def test_engine_log_metrics_regression(
assert_metrics(model, engine, disable_log_stats, len(example_prompts)) assert_metrics(model, engine, disable_log_stats, len(example_prompts))
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
def test_metric_spec_decode(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
k = 5
with vllm_runner(
model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_config={
"model": model,
"num_speculative_tokens": k,
},
) as vllm_model:
# Force log interval to be 0 to catch all metrics.
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger.local_interval = 0
# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn = {
"gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
"gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
"counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
"counter_spec_decode_num_draft_tokens": lambda v: v == k,
"counter_spec_decode_num_emitted_tokens":
lambda v: 0 <= v <= k + 1,
}
# Use one request to better inspect the metrics.
prompts = example_prompts[:1]
_ = vllm_model.generate_greedy(prompts, max_tokens)
for metric_name, is_expected in metric_name_to_expected_fn.items():
metric_val = getattr(
stat_logger.metrics,
metric_name).labels(**stat_logger.labels)._value.get()
assert is_expected(metric_val), (
f"the value of metric {metric_name} ({metric_val}) "
"does not meet expectation")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
log_interval: int,
) -> None:
k = 5
engine_args = EngineArgs(
model=model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_config={
"model": model,
"num_speculative_tokens": k,
},
enforce_eager=True,
)
engine = LLMEngine.from_engine_args(engine_args)
try:
engine.add_request(
"request-id-0",
example_prompts[0],
SamplingParams(max_tokens=max_tokens),
)
# set log interval
stat_logger = engine.stat_loggers['prometheus']
stat_logger.local_interval = log_interval
# prefill
engine.step()
# wait for 5 seconds to ensure that spec decode metrics
# get triggered in first decode step
time.sleep(5)
# first decode step should trigger async collection of metrics
engine.step()
# wait one second to allow H2D transfer to finish
time.sleep(1)
# second decode step should now be able to collect the spec
# decode stats and the request should also be finished
engine.step()
# must have finished now
assert not engine.has_unfinished_requests()
# wait to ensure logging occurs
time.sleep(log_interval)
# force logging
engine.step()
# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn = {
"gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
"gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
"counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
"counter_spec_decode_num_draft_tokens": lambda v: v == k,
"counter_spec_decode_num_emitted_tokens":
lambda v: 0 <= v <= k + 1,
}
for metric_name, is_expected in metric_name_to_expected_fn.items():
metric_val = getattr(
stat_logger.metrics,
metric_name).labels(**stat_logger.labels)._value.get()
assert is_expected(metric_val), (
f"the value of metric {metric_name} ({metric_val}) "
"does not meet expectation")
finally:
del engine
cleanup_dist_env_and_memory()
def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None: num_requests: int) -> None:
if disable_log_stats: if disable_log_stats:
......
...@@ -50,20 +50,15 @@ def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex, ...@@ -50,20 +50,15 @@ def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex,
whitespace_pattern=None, whitespace_pattern=None,
reasoner=None) reasoner=None)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}")
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
regex_LP(token_ids, tensor) tensor = regex_LP([], tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}"
)
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
json_LP(token_ids, tensor) tensor = json_LP([], tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
...@@ -85,8 +80,6 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, ...@@ -85,8 +80,6 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
seed=0, seed=0,
dtype="bfloat16", dtype="bfloat16",
) )
token_ids = zephyr_7B_tokenzer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
regex_lp = get_local_guided_decoding_logits_processor( regex_lp = get_local_guided_decoding_logits_processor(
...@@ -96,13 +89,11 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, ...@@ -96,13 +89,11 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
assert regex_lp is not None assert regex_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
tensor = regex_lp(token_ids, tensor) # allowed tokens at state 0
tensor = regex_lp([], tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}"
)
json_request = GuidedDecodingParams(json=sample_json_schema, json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend) backend=backend)
json_lp = await get_guided_decoding_logits_processor( json_lp = await get_guided_decoding_logits_processor(
...@@ -110,7 +101,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, ...@@ -110,7 +101,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
assert json_lp is not None assert json_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor) tensor = json_lp([], tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
...@@ -134,7 +125,6 @@ async def test_guided_logits_processor_with_reasoning( ...@@ -134,7 +125,6 @@ async def test_guided_logits_processor_with_reasoning(
dtype="bfloat16", dtype="bfloat16",
) )
token_ids = deepseek_r1_qwen_tokenizer.encode( token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}."
"<think>here is the thinking process") "<think>here is the thinking process")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend) regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
...@@ -145,14 +135,13 @@ async def test_guided_logits_processor_with_reasoning( ...@@ -145,14 +135,13 @@ async def test_guided_logits_processor_with_reasoning(
regex_request, deepseek_r1_qwen_tokenizer, config, regex_request, deepseek_r1_qwen_tokenizer, config,
reasoning_backend) reasoning_backend)
assert regex_lp is not None assert regex_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(151664)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
tensor = regex_lp(token_ids, tensor) tensor = regex_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert torch.allclose(tensor, original_tensor) assert torch.allclose(tensor, original_tensor)
token_ids = deepseek_r1_qwen_tokenizer.encode( token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}."
"<think>here is the thinking process") "<think>here is the thinking process")
json_request = GuidedDecodingParams(json=sample_json_schema, json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend) backend=backend)
...@@ -162,7 +151,7 @@ async def test_guided_logits_processor_with_reasoning( ...@@ -162,7 +151,7 @@ async def test_guided_logits_processor_with_reasoning(
await get_guided_decoding_logits_processor( await get_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend) json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
assert json_lp is not None assert json_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(151664)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor) tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
...@@ -170,8 +159,7 @@ async def test_guided_logits_processor_with_reasoning( ...@@ -170,8 +159,7 @@ async def test_guided_logits_processor_with_reasoning(
# Thinking is over, so the tensor should change. # Thinking is over, so the tensor should change.
token_ids = deepseek_r1_qwen_tokenizer.encode( token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}." "<think>here is the thinking process</think>")
"<think>here is the thinking process</think> Then")
json_request = GuidedDecodingParams(json=sample_json_schema, json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend) backend=backend)
json_lp = get_local_guided_decoding_logits_processor( json_lp = get_local_guided_decoding_logits_processor(
...@@ -180,7 +168,7 @@ async def test_guided_logits_processor_with_reasoning( ...@@ -180,7 +168,7 @@ async def test_guided_logits_processor_with_reasoning(
await get_guided_decoding_logits_processor( await get_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend) json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
assert json_lp is not None assert json_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(151664)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor) tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
...@@ -205,19 +193,6 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): ...@@ -205,19 +193,6 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
GuidedDecodingParams(json=sample_json_schema, grammar="test grammar") GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
def test_guided_decoding_backend_options():
"""Test backend-specific options"""
with pytest.warns(DeprecationWarning):
guided_decoding_params = GuidedDecodingParams(
backend=
"xgrammar:no-fallback,disable-any-whitespace,no-additional-properties"
)
assert guided_decoding_params.backend == "xgrammar"
assert guided_decoding_params.disable_fallback
assert guided_decoding_params.disable_any_whitespace
assert guided_decoding_params.disable_additional_properties
def test_pickle_xgrammar_tokenizer_data(): def test_pickle_xgrammar_tokenizer_data():
try: try:
import xgrammar as xgr import xgrammar as xgr
......
...@@ -5,7 +5,8 @@ import os ...@@ -5,7 +5,8 @@ import os
import pytest import pytest
from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType from vllm.model_executor.layers.pooler import (CLSPool, DispatchPooler,
MeanPool, PoolingType)
from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.bert import BertEmbeddingModel
from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -33,8 +34,8 @@ def test_model_loading_with_params(vllm_runner): ...@@ -33,8 +34,8 @@ def test_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that" output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -50,7 +51,8 @@ def test_model_loading_with_params(vllm_runner): ...@@ -50,7 +51,8 @@ def test_model_loading_with_params(vllm_runner):
def check_model(model): def check_model(model):
assert isinstance(model, BertEmbeddingModel) assert isinstance(model, BertEmbeddingModel)
assert isinstance(model._pooler, CLSPool) assert isinstance(pooler := model.pooler, DispatchPooler)
assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool)
vllm_model.apply_model(check_model) vllm_model.apply_model(check_model)
...@@ -71,8 +73,8 @@ def test_roberta_model_loading_with_params(vllm_runner): ...@@ -71,8 +73,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that" output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -88,7 +90,9 @@ def test_roberta_model_loading_with_params(vllm_runner): ...@@ -88,7 +90,9 @@ def test_roberta_model_loading_with_params(vllm_runner):
def check_model(model): def check_model(model):
assert isinstance(model, RobertaEmbeddingModel) assert isinstance(model, RobertaEmbeddingModel)
assert isinstance(model._pooler, MeanPool) assert isinstance(pooler := model.pooler, DispatchPooler)
assert isinstance(pooler.poolers_by_task["embed"].pooling,
MeanPool)
vllm_model.apply_model(check_model) vllm_model.apply_model(check_model)
...@@ -109,13 +113,14 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): ...@@ -109,13 +113,14 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that" output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_tokenizer = vllm_model.model.llm_engine.tokenizer model_tokenizer = vllm_model.llm.llm_engine.tokenizer
assert model_tokenizer.tokenizer_id == model_name assert model_tokenizer.tokenizer_id == model_name
def check_model(model): def check_model(model):
assert isinstance(model, RobertaEmbeddingModel) assert isinstance(model, RobertaEmbeddingModel)
assert not hasattr(model, "lm_head") assert not hasattr(model, "lm_head")
assert isinstance(model._pooler, CLSPool) assert isinstance(pooler := model.pooler, DispatchPooler)
assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool)
vllm_model.apply_model(check_model) vllm_model.apply_model(check_model)
......
...@@ -41,7 +41,7 @@ AITER_MODEL_LIST = [ ...@@ -41,7 +41,7 @@ AITER_MODEL_LIST = [
[ [
pytest.param( pytest.param(
os.path.join(models_path_prefix, "bigscience/bloom-560m"), # bloom - testing alibi slopes os.path.join(models_path_prefix, "bigscience/bloom-560m"), # bloom - testing alibi slopes
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model],
), ),
pytest.param( pytest.param(
os.path.join(models_path_prefix, "openai-community/gpt2"), # gpt2 os.path.join(models_path_prefix, "openai-community/gpt2"), # gpt2
...@@ -89,7 +89,11 @@ AITER_MODEL_LIST = [ ...@@ -89,7 +89,11 @@ AITER_MODEL_LIST = [
pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")), # starcoder2 pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")), # starcoder2
pytest.param( pytest.param(
os.path.join(models_path_prefix, "TitanML/tiny-mixtral"), # mixtral os.path.join(models_path_prefix, "TitanML/tiny-mixtral"), # mixtral
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model],
),
pytest.param(
os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924-Instruct"),
marks=[pytest.mark.cpu_model],
) )
]) ])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
......
...@@ -15,13 +15,13 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None: ...@@ -15,13 +15,13 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
load_format="dummy", load_format="dummy",
) as llm: ) as llm:
if model == "google/gemma-3-4b-it": if model == "google/gemma-3-4b-it":
normalizers = llm.model.collective_rpc( normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model. lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item()) normalizer.cpu().item())
config = llm.model.llm_engine.model_config.hf_config.text_config config = llm.llm.llm_engine.model_config.hf_config.text_config
else: else:
normalizers = llm.model.collective_rpc( normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu( lambda self: self.model_runner.model.model.normalizer.cpu(
).item()) ).item())
config = llm.model.llm_engine.model_config.hf_config config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3) assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
...@@ -63,13 +63,6 @@ V1_SUPPORTED_MODELS = [ ...@@ -63,13 +63,6 @@ V1_SUPPORTED_MODELS = [
os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"), os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
] ]
ATTN_BLOCK_SIZES = {
os.path.join(models_path_prefix,"ibm-ai-platform/Bamba-9B-v1"): 528,
os.path.join(models_path_prefix,"Zyphra/Zamba2-1.2B-instruct"): 80,
os.path.join(models_path_prefix,"nvidia/Nemotron-H-8B-Base-8K"): 528,
os.path.join(models_path_prefix,"ibm-granite/granite-4.0-tiny-preview"): 400,
os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"): 800,
}
# Avoid OOM # Avoid OOM
MAX_NUM_SEQS = 4 MAX_NUM_SEQS = 4
...@@ -107,11 +100,6 @@ def test_models( ...@@ -107,11 +100,6 @@ def test_models(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
if model in V1_SUPPORTED_MODELS: if model in V1_SUPPORTED_MODELS:
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
block_size = ATTN_BLOCK_SIZES[model]
else:
block_size = 16
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1") m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS: if model in HYBRID_MODELS:
...@@ -119,9 +107,7 @@ def test_models( ...@@ -119,9 +107,7 @@ def test_models(
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model, with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS, max_num_seqs=MAX_NUM_SEQS,
enforce_eager=True, enable_prefix_caching=False) as vllm_model:
enable_prefix_caching=False,
block_size=block_size) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs( vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
else: else:
...@@ -291,7 +277,7 @@ def test_models_preemption_recompute( ...@@ -291,7 +277,7 @@ def test_models_preemption_recompute(
Tests that outputs are identical with and w/o preemptions (recompute). Tests that outputs are identical with and w/o preemptions (recompute).
""" """
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.model.llm_engine.scheduler[0] scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy( preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens) example_prompts, max_tokens)
......
...@@ -240,8 +240,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, ...@@ -240,8 +240,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
load_format="mistral") as vllm_model: load_format="mistral") as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS: for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt} msg = {"role": "user", "content": prompt}
outputs = vllm_model.model.chat([msg], outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS) sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip() assert "�" not in outputs[0].outputs[0].text.strip()
...@@ -255,11 +255,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: ...@@ -255,11 +255,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
load_format="mistral") as vllm_model: load_format="mistral") as vllm_model:
msgs = copy.deepcopy(MSGS) msgs = copy.deepcopy(MSGS)
outputs = vllm_model.model.chat(msgs, outputs = vllm_model.llm.chat(msgs,
tools=TOOLS, tools=TOOLS,
sampling_params=SAMPLING_PARAMS) sampling_params=SAMPLING_PARAMS)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer) tool_parser = MistralToolParser(tokenizer)
model_output = outputs[0].outputs[0].text.strip() model_output = outputs[0].outputs[0].text.strip()
...@@ -310,7 +310,7 @@ def test_mistral_guided_decoding( ...@@ -310,7 +310,7 @@ def test_mistral_guided_decoding(
f"Give an example JSON for an employee profile that " f"Give an example JSON for an employee profile that "
f"fits this schema: {SAMPLE_JSON_SCHEMA}" f"fits this schema: {SAMPLE_JSON_SCHEMA}"
}] }]
outputs = vllm_model.model.chat(messages, sampling_params=params) outputs = vllm_model.llm.chat(messages, sampling_params=params)
generated_text = outputs[0].outputs[0].text generated_text = outputs[0].outputs[0].text
json_response = json.loads(generated_text) json_response = json.loads(generated_text)
......
...@@ -23,14 +23,14 @@ MTEB_EMBED_TOL = 1e-4 ...@@ -23,14 +23,14 @@ MTEB_EMBED_TOL = 1e-4
# See #19344 # See #19344
MTEB_RERANK_TASKS = ["NFCorpus"] MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"] MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 1e-3 MTEB_RERANK_TOL = 2e-3
class VllmMtebEncoder(mteb.Encoder): class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model): def __init__(self, vllm_model):
super().__init__() super().__init__()
self.model = vllm_model self.llm = vllm_model
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
def encode( def encode(
...@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder): ...@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
# issues by randomizing the order. # issues by randomizing the order.
r = self.rng.permutation(len(sentences)) r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r] sentences = [sentences[i] for i in r]
outputs = self.model.embed(sentences, use_tqdm=False) outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs) embeds = np.array(outputs)
embeds = embeds[np.argsort(r)] embeds = embeds[np.argsort(r)]
return embeds return embeds
...@@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder): ...@@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences] queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences] corpus = [s[1] for s in sentences]
outputs = self.model.score(queries, outputs = self.llm.score(queries,
corpus, corpus,
truncate_prompt_tokens=-1, truncate_prompt_tokens=-1,
use_tqdm=False) use_tqdm=False)
scores = np.array(outputs) scores = np.array(outputs)
scores = scores[np.argsort(r)] scores = scores[np.argsort(r)]
return scores return scores
...@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner, ...@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
if model_info.architecture: if model_info.architecture:
assert (model_info.architecture assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures) in vllm_model.llm.llm_engine.model_config.architectures)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS) MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name, with hf_runner(model_info.name,
is_sentence_transformer=True, is_sentence_transformer=True,
...@@ -267,7 +267,9 @@ def mteb_test_rerank_models(hf_runner, ...@@ -267,7 +267,9 @@ def mteb_test_rerank_models(hf_runner,
vllm_runner, vllm_runner,
model_info: RerankModelInfo, model_info: RerankModelInfo,
vllm_extra_kwargs=None, vllm_extra_kwargs=None,
hf_model_callback=None): hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL):
if not model_info.enable_test: if not model_info.enable_test:
# A model family has many models with the same architecture, # A model family has many models with the same architecture,
# and we don't need to test each one. # and we don't need to test each one.
...@@ -282,13 +284,13 @@ def mteb_test_rerank_models(hf_runner, ...@@ -282,13 +284,13 @@ def mteb_test_rerank_models(hf_runner,
max_num_seqs=8, max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture: if model_info.architecture:
assert (model_info.architecture in model_config.architectures) assert (model_info.architecture in model_config.architectures)
assert model_config.hf_config.num_labels == 1 assert model_config.hf_config.num_labels == 1
vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model), vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS, tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS) languages=MTEB_RERANK_LANGS)
vllm_dtype = model_config.dtype vllm_dtype = model_config.dtype
...@@ -300,4 +302,4 @@ def mteb_test_rerank_models(hf_runner, ...@@ -300,4 +302,4 @@ def mteb_test_rerank_models(hf_runner,
print("SentenceTransformers:", st_dtype, st_main_score) print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score) print("Difference:", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL) assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
...@@ -68,7 +68,6 @@ RERANK_MODELS = [ ...@@ -68,7 +68,6 @@ RERANK_MODELS = [
enable_test=False), enable_test=False),
RerankModelInfo("BAAI/bge-reranker-v2-m3", RerankModelInfo("BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
dtype="float32",
enable_test=False) enable_test=False)
] ]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional
import numpy as np
import pytest
import torch
from tests.conftest import HfRunner
from .mteb_utils import (RerankModelInfo, VllmMtebEncoder,
mteb_test_rerank_models)
RERANK_MODELS = [
RerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification"),
]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner):
    """Hugging Face reference runner for the BGE Gemma reranker.

    The checkpoint is loaded as a causal LM. Each (query, passage) pair is
    scored as the sigmoid of the logit that the last position assigns to
    the ``"Yes"`` token.
    """

    def __init__(self,
                 model_name: str,
                 dtype: str = "auto",
                 *args: Any,
                 **kwargs: Any) -> None:
        """Load the model with ``AutoModelForCausalLM`` and a left-padding
        tokenizer, and look up the id of the ``"Yes"`` token.

        Extra positional and keyword arguments are accepted but are not
        forwarded to ``HfRunner.__init__``.
        """
        # Local import: only needed when the runner is instantiated.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                       padding_side='left')
        self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")

    @torch.no_grad()
    def predict(self, prompts: list[list[str]], *args,
                **kwargs) -> torch.Tensor:
        """Score query/passage pairs, one forward pass per pair.

        Args:
            prompts: Sequences whose first two items are the query and the
                passage; any further items are ignored.

        Returns:
            A tensor holding one score per entry of ``prompts``, in input
            order.
        """
        tokenizer = self.tokenizer

        def token_ids(text):
            # Plain token ids, without any special tokens added.
            return tokenizer(text,
                             return_tensors=None,
                             add_special_tokens=False)["input_ids"]

        # The instruction and the separator are the same for every pair,
        # so tokenize them once instead of once per pair.
        prompt_ids = token_ids(PROMPT)
        sep_ids = token_ids("\n")

        def build_inputs(query, passage):
            # Token layout:
            #   BOS + "A: <query>" + "\n" + "B: <passage>" + "\n" + PROMPT
            query_inputs = tokenizer(
                f"A: {query}",
                return_tensors=None,
                add_special_tokens=False,
                truncation=True,
            )
            passage_inputs = tokenizer(
                f"B: {passage}",
                return_tensors=None,
                add_special_tokens=False,
                truncation=True,
            )
            item = tokenizer.prepare_for_model(
                [tokenizer.bos_token_id] + query_inputs["input_ids"],
                sep_ids + passage_inputs["input_ids"],
                truncation="only_second",
                padding=False,
                return_attention_mask=False,
                return_token_type_ids=False,
                add_special_tokens=False,
            )
            # The instruction is appended after truncation, so it is never
            # cut off; the attention mask is rebuilt for the final length.
            item["input_ids"] = item["input_ids"] + sep_ids + prompt_ids
            item["attention_mask"] = [1] * len(item["input_ids"])
            return tokenizer.pad(
                [item],
                padding=True,
                return_tensors="pt",
            )

        scores = []
        for query, doc, *_ in prompts:
            inputs = build_inputs(query, doc).to(self.model.device)
            logits = self.model(**inputs, return_dict=True).logits
            # Logit of the "Yes" token at the last position -> probability.
            yes_scores = (logits[:, -1,
                                 self.yes_loc].view(-1, ).float().sigmoid())
            scores.append(yes_scores[0].item())
        return torch.Tensor(scores)
class GemmaMtebEncoder(VllmMtebEncoder):
    """MTEB encoder that wraps each pair in the Gemma reranker template
    before delegating to ``VllmMtebEncoder.predict``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and set up the templates."""
        super().__init__(*args, **kwargs)
        # Default instruction, used when a sentence carries no prompt.
        self.prompt = PROMPT
        self.query_template = "A: {query}\n"
        self.document_template = "B: {doc}\n{prompt}"

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Apply the query/document templates, then score via the parent.

        Args:
            sentences: ``(query, corpus, prompt)`` triples. ``prompt`` may
                be ``None``, in which case ``self.prompt`` is used to fill
                the document template.

        Returns:
            Whatever the parent ``predict`` returns for the templated
            triples.
        """
        _sentences = []
        for query, corpus, prompt in sentences:
            # Without this fallback, str.format would render a missing
            # prompt as the literal text "None" in the document.
            instruction = self.prompt if prompt is None else prompt
            query = self.query_template.format(query=query)
            corpus = self.document_template.format(doc=corpus,
                                                   prompt=instruction)
            # The original prompt value is passed through unchanged.
            _sentences.append((query, corpus, prompt))

        return super().predict(_sentences, *args, **kwargs)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo,
                            monkeypatch) -> None:
    """Compare vLLM and the HF reference on the MTEB rerank task.

    Sets ``VLLM_USE_V1=0`` for the duration of the test and uses
    ``GemmaMtebEncoder`` on the vLLM side so both sides see templated
    inputs.
    """
    expected_arch = "GemmaForSequenceClassification"

    monkeypatch.setenv("VLLM_USE_V1", "0")
    assert model_info.architecture == expected_arch

    # Overrides handed to vLLM through the runner's extra kwargs.
    hf_overrides = {
        "architectures": [expected_arch],
        "classifier_from_token": ["Yes"],
        "method": "no_post_processing",
    }
    vllm_extra_kwargs: dict[str, Any] = {"hf_overrides": hf_overrides}

    mteb_test_rerank_models(
        GemmaRerankerHfRunner,
        vllm_runner,
        model_info,
        vllm_extra_kwargs,
        vllm_mteb_encoder=GemmaMtebEncoder,
    )
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional from typing import Optional
import os
import pytest import pytest
from vllm.config import PoolerConfig from vllm.config import PoolerConfig
...@@ -31,8 +31,10 @@ def v1(run_with_both_engines): ...@@ -31,8 +31,10 @@ def v1(run_with_both_engines):
# [Decoder-only] # [Decoder-only]
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"), pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
marks=[pytest.mark.core_model]), marks=[pytest.mark.core_model]),
pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"), pytest.param(
marks=[pytest.mark.core_model, pytest.mark.cpu_model]), os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
# the qwen models interfere with each other (see PR # the qwen models interfere with each other (see PR
# https://github.com/vllm-project/vllm/pull/18720). # https://github.com/vllm-project/vllm/pull/18720).
# To avoid this problem, for now we skip v0 since it will be # To avoid this problem, for now we skip v0 since it will be
...@@ -40,11 +42,13 @@ def v1(run_with_both_engines): ...@@ -40,11 +42,13 @@ def v1(run_with_both_engines):
pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"), pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"),
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]), marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only] # [Encoder-only]
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"), pytest.param(
marks=[ os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
pytest.mark.core_model, pytest.mark.cpu_model, marks=[
pytest.mark.skip_v1 # CPU only supports V1
]), pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2"), pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2"),
marks=[pytest.mark.skip_v1]), marks=[pytest.mark.skip_v1]),
pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"), pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
...@@ -66,10 +70,6 @@ def test_models( ...@@ -66,10 +70,6 @@ def test_models(
model, model,
monkeypatch, monkeypatch,
) -> None: ) -> None:
if model == os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct") and current_platform.is_cpu(
) and os.environ.get("VLLM_USE_V1", "0") == "1":
pytest.skip("CPU V1 doesn't support sliding window")
if model == os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2") and current_platform.is_rocm(): if model == os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2") and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention # ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend # switch to use ROCm CK FA backend
......
...@@ -2,10 +2,9 @@ ...@@ -2,10 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations from __future__ import annotations
import importlib.util
from array import array
import os import os
import numpy as np
import openai import openai
import pytest import pytest
from scipy.spatial.distance import cosine from scipy.spatial.distance import cosine
...@@ -16,9 +15,6 @@ from vllm.config import ModelConfig ...@@ -16,9 +15,6 @@ from vllm.config import ModelConfig
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
from ....utils import models_path_prefix from ....utils import models_path_prefix
# GritLM embedding implementation is only supported by XFormers backend.
pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
reason="GritLM requires XFormers")
MODEL_NAME = os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm") MODEL_NAME = os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")
MAX_MODEL_LEN = 4000 MAX_MODEL_LEN = 4000
...@@ -28,11 +24,11 @@ def _arr(arr): ...@@ -28,11 +24,11 @@ def _arr(arr):
""" """
Convert a list of integers to an array of integers. Convert a list of integers to an array of integers.
""" """
return array("i", arr) return np.array(arr)
def test_find_array(): def test_find_array():
from vllm.model_executor.models.gritlm import GritLMPooler from vllm.model_executor.models.gritlm import GritLMMeanPool
model_config = ModelConfig( model_config = ModelConfig(
MODEL_NAME, MODEL_NAME,
...@@ -43,17 +39,19 @@ def test_find_array(): ...@@ -43,17 +39,19 @@ def test_find_array():
dtype="bfloat16", dtype="bfloat16",
seed=0, seed=0,
) )
pooler = GritLMPooler(model_config=model_config) pooling = GritLMMeanPool(model_config=model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1 assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1 assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=3) == -1
assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=4) == 3
assert pooling._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1) pooling._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
def run_llm_encode( def run_llm_encode(
...@@ -126,7 +124,7 @@ def test_gritlm_offline_embedding(vllm_runner): ...@@ -126,7 +124,7 @@ def test_gritlm_offline_embedding(vllm_runner):
task="embed", task="embed",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
) as vllm_model: ) as vllm_model:
llm = vllm_model.model llm = vllm_model.llm
d_rep = run_llm_encode( d_rep = run_llm_encode(
llm, llm,
...@@ -173,7 +171,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): ...@@ -173,7 +171,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
task="generate", task="generate",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
) as vllm_model: ) as vllm_model:
llm = vllm_model.model llm = vllm_model.llm
sampling_params = SamplingParams(temperature=0.0, max_tokens=256) sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params) outputs = llm.generate(input, sampling_params=sampling_params)
......
...@@ -18,11 +18,8 @@ EMBEDDING_MODELS = [ ...@@ -18,11 +18,8 @@ EMBEDDING_MODELS = [
] ]
RERANK_MODELS = [ RERANK_MODELS = [
RerankModelInfo( RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual",
"jinaai/jina-reranker-v2-base-multilingual", architecture="XLMRobertaForSequenceClassification")
architecture="XLMRobertaForSequenceClassification",
dtype="float32",
)
] ]
...@@ -90,10 +87,10 @@ def test_matryoshka( ...@@ -90,10 +87,10 @@ def test_matryoshka(
task="embed", task="embed",
dtype=dtype, dtype=dtype,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
assert vllm_model.model.llm_engine.model_config.is_matryoshka assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = ( matryoshka_dimensions = (
vllm_model.model.llm_engine.model_config.matryoshka_dimensions) vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
assert matryoshka_dimensions is not None assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions: if dimensions not in matryoshka_dimensions:
......
...@@ -12,11 +12,9 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models ...@@ -12,11 +12,9 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
RERANK_MODELS = [ RERANK_MODELS = [
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=True), enable_test=True),
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=False) enable_test=False)
] ]
......
...@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor) ...@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
def test_default(model_info, vllm_runner): def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, task="embed",
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512 # For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json. # by sentence_bert_config.json.
...@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): ...@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512 # set max_model_len <= 512
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, task="embed",
max_model_len=256) as vllm_model: max_model_len=256) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256 assert model_config.max_model_len == 256
# set 512 < max_model_len <= 2048 # set 512 < max_model_len <= 2048
...@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): ...@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
else: else:
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, task="embed",
max_model_len=1024) as vllm_model: max_model_len=1024) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024 assert model_config.max_model_len == 1024
......
...@@ -6,17 +6,16 @@ import pytest ...@@ -6,17 +6,16 @@ import pytest
import torch import torch
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.utils import multi_gpu_test
from .mteb_utils import RerankModelInfo, mteb_test_rerank_models from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
RERANK_MODELS = [ RERANK_MODELS = [
RerankModelInfo("Qwen/Qwen3-Reranker-0.6B", RerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
dtype="float32",
enable_test=True), enable_test=True),
RerankModelInfo("Qwen/Qwen3-Reranker-4B", RerankModelInfo("Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
dtype="float32",
enable_test=False) enable_test=False)
] ]
...@@ -89,3 +88,29 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: ...@@ -89,3 +88,29 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info, mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs) vllm_extra_kwargs)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
@multi_gpu_test(num_gpus=2)
def test_rerank_models_mteb_tp(vllm_runner,
                               model_info: RerankModelInfo) -> None:
    """Run the MTEB rerank comparison with ``tensor_parallel_size=2``.

    The comparison uses an absolute tolerance of ``1.2e-2`` instead of the
    helper's default.
    """
    expected_arch = "Qwen3ForSequenceClassification"
    assert model_info.architecture == expected_arch

    hf_overrides = {
        "architectures": [expected_arch],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    }
    vllm_extra_kwargs: dict[str, Any] = {
        "hf_overrides": hf_overrides,
        "tensor_parallel_size": 2,
    }

    # NOTE(review): mteb_test_rerank_models appears to pass max_num_seqs=8
    # itself next to **vllm_extra_kwargs; confirm this override cannot
    # collide if the 4B entry is ever enabled.
    if model_info.name == "Qwen/Qwen3-Reranker-4B":
        vllm_extra_kwargs["max_num_seqs"] = 1

    mteb_test_rerank_models(
        Qwen3RerankerHfRunner,
        vllm_runner,
        model_info,
        vllm_extra_kwargs,
        atol=1.2e-2,
    )
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest import pytest
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
...@@ -84,6 +86,9 @@ def test_prm_models( ...@@ -84,6 +86,9 @@ def test_prm_models(
dtype: str, dtype: str,
monkeypatch, monkeypatch,
) -> None: ) -> None:
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
pytest.skip("CPU only supports V1")
if current_platform.is_rocm(): if current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention # ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend # switch to use ROCm CK FA backend
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment