Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
......@@ -38,8 +38,8 @@ ERROR_CASES = [
]
def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1,
def test_peft_helper_pass(sql_lora_files, tmp_path):
peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
max_position_embeddings=4096)
lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
peft_helper.validate_legal(lora_config)
......@@ -56,15 +56,12 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
"embed_tokens",
"lm_head",
]
assert peft_helper.context_length == 16384
assert peft_helper.vllm_max_position_embeddings == 4096
assert peft_helper.vllm_long_context_scaling_factor == float(
math.ceil(peft_helper.context_length /
peft_helper.vllm_max_position_embeddings))
# test RSLoRA
rslora_config = dict(use_rslora=True)
test_dir = tmp_path / "test_rslora"
shutil.copytree(long_context_lora_files_16k_1, test_dir)
shutil.copytree(sql_lora_files, test_dir)
# Load and modify configuration
config_path = test_dir / "adapter_config.json"
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import vllm
from vllm.lora.request import LoRARequest
......@@ -51,9 +51,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
def test_phi2_lora(phi2_lora_files):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM.
......
......@@ -9,7 +9,7 @@ from vllm.platforms import current_platform
from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "ArthurZ/ilama-3.2-1B"
MODEL_PATH = "hmellor/Ilama-3.2-1B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import time
import os
import pytest
import ray
from prometheus_client import REGISTRY
import vllm.envs as envs
from vllm import EngineArgs, LLMEngine
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
......@@ -48,7 +46,7 @@ def test_metric_counter_prompt_tokens(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
prompt_token_counts = [
len(tokenizer.encode(p)) for p in example_prompts
]
......@@ -60,7 +58,7 @@ def test_metric_counter_prompt_tokens(
vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()
......@@ -84,8 +82,8 @@ def test_metric_counter_generation_tokens(
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
......@@ -120,8 +118,8 @@ def test_metric_counter_generation_tokens_multi_step(
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
......@@ -152,7 +150,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metrics_tag_content = stat_logger.labels["model_name"]
if envs.VLLM_CI_USE_S3:
......@@ -236,149 +234,6 @@ def test_engine_log_metrics_regression(
assert_metrics(model, engine, disable_log_stats, len(example_prompts))
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
def test_metric_spec_decode(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check the speculative-decoding Prometheus metrics after one request.

    The model is used as its own draft model with ``k`` speculative tokens.
    The test verifies that the metrics are populated with plausible values,
    not that decoding is functionally correct, so the bounds are loose.
    """
    k = 5

    def is_ratio(v):
        return 0 <= v <= 1

    # Loose per-metric predicates; see the docstring for the rationale.
    metric_name_to_expected_fn = {
        "gauge_spec_decode_draft_acceptance_rate": is_ratio,
        "gauge_spec_decode_efficiency": is_ratio,
        "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
        "counter_spec_decode_num_draft_tokens": lambda v: v == k,
        "counter_spec_decode_num_emitted_tokens": lambda v: 0 <= v <= k + 1,
    }

    draft_config = {"model": model, "num_speculative_tokens": k}
    with vllm_runner(
            model,
            dtype=dtype,
            disable_log_stats=False,
            gpu_memory_utilization=0.4,
            speculative_config=draft_config,
    ) as vllm_model:
        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
        # A zero log interval makes every step report, so no metric is missed.
        stat_logger.local_interval = 0

        # A single request keeps the metric values easy to reason about.
        vllm_model.generate_greedy(example_prompts[:1], max_tokens)

        for metric_name, is_expected in metric_name_to_expected_fn.items():
            metric = getattr(stat_logger.metrics, metric_name)
            metric_val = metric.labels(**stat_logger.labels)._value.get()
            assert is_expected(metric_val), (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    log_interval: int,
) -> None:
    """Check spec-decode metrics when the stat logger uses a log interval.

    One request is stepped manually through an ``LLMEngine`` configured with
    speculative decoding, with sleeps between steps so that the metrics are
    collected and then logged. As in the non-interval test, the expected
    values are deliberately loose: this verifies the metrics, not decoding
    correctness.
    """
    k = 5

    draft_config = {"model": model, "num_speculative_tokens": k}
    engine = LLMEngine.from_engine_args(
        EngineArgs(
            model=model,
            dtype=dtype,
            disable_log_stats=False,
            gpu_memory_utilization=0.4,
            speculative_config=draft_config,
            enforce_eager=True,
        ))

    try:
        sampling_params = SamplingParams(max_tokens=max_tokens)
        engine.add_request("request-id-0", example_prompts[0],
                           sampling_params)

        # Set the log interval under test.
        stat_logger = engine.stat_loggers['prometheus']
        stat_logger.local_interval = log_interval

        # Prefill.
        engine.step()

        # Wait 5 seconds so the spec decode metrics get triggered in the
        # first decode step.
        time.sleep(5)

        # The first decode step should trigger async collection of metrics.
        engine.step()

        # Wait one second to allow the H2D transfer to finish.
        time.sleep(1)

        # The second decode step should now be able to collect the spec
        # decode stats, and the request should also be finished.
        engine.step()
        assert not engine.has_unfinished_requests()

        # Let the log interval elapse, then step once more to force logging.
        time.sleep(log_interval)
        engine.step()

        def is_ratio(v):
            return 0 <= v <= 1

        metric_name_to_expected_fn = {
            "gauge_spec_decode_draft_acceptance_rate": is_ratio,
            "gauge_spec_decode_efficiency": is_ratio,
            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
            "counter_spec_decode_num_emitted_tokens":
            lambda v: 0 <= v <= k + 1,
        }

        for metric_name, is_expected in metric_name_to_expected_fn.items():
            metric = getattr(stat_logger.metrics, metric_name)
            metric_val = metric.labels(**stat_logger.labels)._value.get()
            assert is_expected(metric_val), (
                f"the value of metric {metric_name} ({metric_val}) "
                "does not meet expectation")
    finally:
        # Drop the engine reference before cleaning up distributed state.
        del engine
        cleanup_dist_env_and_memory()
def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None:
if disable_log_stats:
......
......@@ -50,20 +50,15 @@ def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex,
whitespace_pattern=None,
reasoner=None)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}")
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
regex_LP(token_ids, tensor)
tensor = regex_LP([], tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}"
)
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
json_LP(token_ids, tensor)
tensor = json_LP([], tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
......@@ -85,8 +80,6 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
seed=0,
dtype="bfloat16",
)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
regex_lp = get_local_guided_decoding_logits_processor(
......@@ -96,13 +89,11 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
assert regex_lp is not None
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
tensor = regex_lp(token_ids, tensor)
# allowed tokens at state 0
tensor = regex_lp([], tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
token_ids = zephyr_7B_tokenzer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}"
)
json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend)
json_lp = await get_guided_decoding_logits_processor(
......@@ -110,7 +101,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
assert json_lp is not None
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor)
tensor = json_lp([], tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
......@@ -134,7 +125,6 @@ async def test_guided_logits_processor_with_reasoning(
dtype="bfloat16",
)
token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}."
"<think>here is the thinking process")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
......@@ -145,14 +135,13 @@ async def test_guided_logits_processor_with_reasoning(
regex_request, deepseek_r1_qwen_tokenizer, config,
reasoning_backend)
assert regex_lp is not None
tensor = torch.rand(32000)
tensor = torch.rand(151664)
original_tensor = torch.clone(tensor)
tensor = regex_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape
assert torch.allclose(tensor, original_tensor)
token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}."
"<think>here is the thinking process")
json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend)
......@@ -162,7 +151,7 @@ async def test_guided_logits_processor_with_reasoning(
await get_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
assert json_lp is not None
tensor = torch.rand(32000)
tensor = torch.rand(151664)
original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape
......@@ -170,8 +159,7 @@ async def test_guided_logits_processor_with_reasoning(
# Thinking is over, so the tensor should change.
token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}."
"<think>here is the thinking process</think> Then")
"<think>here is the thinking process</think>")
json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend)
json_lp = get_local_guided_decoding_logits_processor(
......@@ -180,7 +168,7 @@ async def test_guided_logits_processor_with_reasoning(
await get_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
assert json_lp is not None
tensor = torch.rand(32000)
tensor = torch.rand(151664)
original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape
......@@ -205,19 +193,6 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
def test_guided_decoding_backend_options():
    """Options embedded in the backend string are parsed into flags.

    Building the params from a ``backend:opt1,opt2`` string is expected to
    emit a ``DeprecationWarning`` while still setting the matching fields.
    """
    backend_spec = (
        "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties"
    )

    with pytest.warns(DeprecationWarning):
        params = GuidedDecodingParams(backend=backend_spec)

    # The option suffix is stripped from the backend name ...
    assert params.backend == "xgrammar"
    # ... and each option is reflected in its own attribute.
    assert params.disable_fallback
    assert params.disable_any_whitespace
    assert params.disable_additional_properties
def test_pickle_xgrammar_tokenizer_data():
try:
import xgrammar as xgr
......
......@@ -5,7 +5,8 @@ import os
import pytest
from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType
from vllm.model_executor.layers.pooler import (CLSPool, DispatchPooler,
MeanPool, PoolingType)
from vllm.model_executor.models.bert import BertEmbeddingModel
from vllm.model_executor.models.roberta import RobertaEmbeddingModel
from vllm.platforms import current_platform
......@@ -33,8 +34,8 @@ def test_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
......@@ -50,7 +51,8 @@ def test_model_loading_with_params(vllm_runner):
def check_model(model):
assert isinstance(model, BertEmbeddingModel)
assert isinstance(model._pooler, CLSPool)
assert isinstance(pooler := model.pooler, DispatchPooler)
assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool)
vllm_model.apply_model(check_model)
......@@ -71,8 +73,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
......@@ -88,7 +90,9 @@ def test_roberta_model_loading_with_params(vllm_runner):
def check_model(model):
assert isinstance(model, RobertaEmbeddingModel)
assert isinstance(model._pooler, MeanPool)
assert isinstance(pooler := model.pooler, DispatchPooler)
assert isinstance(pooler.poolers_by_task["embed"].pooling,
MeanPool)
vllm_model.apply_model(check_model)
......@@ -109,13 +113,14 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
assert model_tokenizer.tokenizer_id == model_name
def check_model(model):
assert isinstance(model, RobertaEmbeddingModel)
assert not hasattr(model, "lm_head")
assert isinstance(model._pooler, CLSPool)
assert isinstance(pooler := model.pooler, DispatchPooler)
assert isinstance(pooler.poolers_by_task["embed"].pooling, CLSPool)
vllm_model.apply_model(check_model)
......
......@@ -41,7 +41,7 @@ AITER_MODEL_LIST = [
[
pytest.param(
os.path.join(models_path_prefix, "bigscience/bloom-560m"), # bloom - testing alibi slopes
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[pytest.mark.core_model],
),
pytest.param(
os.path.join(models_path_prefix, "openai-community/gpt2"), # gpt2
......@@ -89,7 +89,11 @@ AITER_MODEL_LIST = [
pytest.param(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")), # starcoder2
pytest.param(
os.path.join(models_path_prefix, "TitanML/tiny-mixtral"), # mixtral
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
marks=[pytest.mark.core_model],
),
pytest.param(
os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924-Instruct"),
marks=[pytest.mark.cpu_model],
)
])
@pytest.mark.parametrize("max_tokens", [32])
......
......@@ -15,13 +15,13 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
load_format="dummy",
) as llm:
if model == "google/gemma-3-4b-it":
normalizers = llm.model.collective_rpc(
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.
normalizer.cpu().item())
config = llm.model.llm_engine.model_config.hf_config.text_config
config = llm.llm.llm_engine.model_config.hf_config.text_config
else:
normalizers = llm.model.collective_rpc(
normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.model.normalizer.cpu(
).item())
config = llm.model.llm_engine.model_config.hf_config
config = llm.llm.llm_engine.model_config.hf_config
assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
......@@ -63,13 +63,6 @@ V1_SUPPORTED_MODELS = [
os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
]
ATTN_BLOCK_SIZES = {
os.path.join(models_path_prefix,"ibm-ai-platform/Bamba-9B-v1"): 528,
os.path.join(models_path_prefix,"Zyphra/Zamba2-1.2B-instruct"): 80,
os.path.join(models_path_prefix,"nvidia/Nemotron-H-8B-Base-8K"): 528,
os.path.join(models_path_prefix,"ibm-granite/granite-4.0-tiny-preview"): 400,
os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"): 800,
}
# Avoid OOM
MAX_NUM_SEQS = 4
......@@ -107,11 +100,6 @@ def test_models(
example_prompts, max_tokens, num_logprobs)
if model in V1_SUPPORTED_MODELS:
if model in HYBRID_MODELS and model in ATTN_BLOCK_SIZES:
block_size = ATTN_BLOCK_SIZES[model]
else:
block_size = 16
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if model in HYBRID_MODELS:
......@@ -119,9 +107,7 @@ def test_models(
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
with vllm_runner(model,
max_num_seqs=MAX_NUM_SEQS,
enforce_eager=True,
enable_prefix_caching=False,
block_size=block_size) as vllm_model:
enable_prefix_caching=False) as vllm_model:
vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
else:
......@@ -291,7 +277,7 @@ def test_models_preemption_recompute(
Tests that outputs are identical with and w/o preemptions (recompute).
"""
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.model.llm_engine.scheduler[0]
scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
......
......@@ -240,8 +240,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
load_format="mistral") as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.model.chat([msg],
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip()
......@@ -255,11 +255,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
load_format="mistral") as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.model.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
model_output = outputs[0].outputs[0].text.strip()
......@@ -310,7 +310,7 @@ def test_mistral_guided_decoding(
f"Give an example JSON for an employee profile that "
f"fits this schema: {SAMPLE_JSON_SCHEMA}"
}]
outputs = vllm_model.model.chat(messages, sampling_params=params)
outputs = vllm_model.llm.chat(messages, sampling_params=params)
generated_text = outputs[0].outputs[0].text
json_response = json.loads(generated_text)
......
......@@ -23,14 +23,14 @@ MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 1e-3
MTEB_RERANK_TOL = 2e-3
class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
super().__init__()
self.model = vllm_model
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
......@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
# issues by randomizing the order.
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.model.embed(sentences, use_tqdm=False)
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
......@@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.model.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
outputs = self.llm.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
......@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)
in vllm_model.llm.llm_engine.model_config.architectures)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name,
is_sentence_transformer=True,
......@@ -267,7 +267,9 @@ def mteb_test_rerank_models(hf_runner,
vllm_runner,
model_info: RerankModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None):
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL):
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
......@@ -282,13 +284,13 @@ def mteb_test_rerank_models(hf_runner,
max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_config.hf_config.num_labels == 1
vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model),
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
tasks=MTEB_RERANK_TASKS,
languages=MTEB_RERANK_LANGS)
vllm_dtype = model_config.dtype
......@@ -300,4 +302,4 @@ def mteb_test_rerank_models(hf_runner,
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
......@@ -68,7 +68,6 @@ RERANK_MODELS = [
enable_test=False),
RerankModelInfo("BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
dtype="float32",
enable_test=False)
]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Optional
import numpy as np
import pytest
import torch
from tests.conftest import HfRunner
from .mteb_utils import (RerankModelInfo, VllmMtebEncoder,
mteb_test_rerank_models)
# Reranker checkpoints that test_rerank_models_mteb is parametrized over.
RERANK_MODELS = [
RerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification"),
]
# Instruction text used when scoring a pair: GemmaRerankerHfRunner appends its
# token ids after each query/passage pair, and GemmaMtebEncoder stores it as
# its `prompt` attribute.
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner):
    """HF reference runner that scores query/passage pairs with a causal LM.

    The model is loaded through ``AutoModelForCausalLM``. A pair's score is
    the sigmoid of the logit assigned to the ``"Yes"`` token at the last
    position of the prompt built from the pair and ``PROMPT``.
    """

    def __init__(self,
                 model_name: str,
                 dtype: str = "auto",
                 *args: Any,
                 **kwargs: Any) -> None:
        """Load the causal LM and a left-padding tokenizer.

        Args:
            model_name: Model identifier passed to the base runner and to
                ``AutoTokenizer.from_pretrained``.
            dtype: Forwarded to the base runner.
            *args: Accepted for signature compatibility; not forwarded.
            **kwargs: Accepted for signature compatibility; not forwarded.
        """
        # Imported lazily so that importing this module does not require
        # transformers to be loaded up front.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                       padding_side='left')
        # Vocabulary id of the "Yes" token whose logit becomes the score.
        self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")

    @torch.no_grad()
    def predict(self, prompts: list[list[str]], *args,
                **kwargs) -> torch.Tensor:
        """Score every query/passage pair, one forward pass per pair.

        Args:
            prompts: Sequences whose first two items are the query and the
                passage; any further items are ignored.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            A 1-D tensor holding one score per entry of ``prompts``.
        """

        def get_inputs(pairs, tokenizer, prompt=None):
            """Tokenize pairs as ``<bos>A: q``, sep, ``B: p``, sep, prompt."""
            if prompt is None:
                prompt = PROMPT

            sep = "\n"
            prompt_inputs = tokenizer(prompt,
                                      return_tensors=None,
                                      add_special_tokens=False)["input_ids"]
            sep_inputs = tokenizer(sep,
                                   return_tensors=None,
                                   add_special_tokens=False)["input_ids"]
            inputs = []
            for query, passage in pairs:
                query_inputs = tokenizer(
                    f"A: {query}",
                    return_tensors=None,
                    add_special_tokens=False,
                    truncation=True,
                )
                passage_inputs = tokenizer(
                    f"B: {passage}",
                    return_tensors=None,
                    add_special_tokens=False,
                    truncation=True,
                )
                # Special tokens are added by hand (BOS up front), so the
                # tokenizer is told not to add its own.
                item = tokenizer.prepare_for_model(
                    [tokenizer.bos_token_id] + query_inputs["input_ids"],
                    sep_inputs + passage_inputs["input_ids"],
                    truncation="only_second",
                    padding=False,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    add_special_tokens=False,
                )
                # The instruction goes last, after another separator.
                item["input_ids"] = item[
                    "input_ids"] + sep_inputs + prompt_inputs
                item["attention_mask"] = [1] * len(item["input_ids"])
                inputs.append(item)
            return tokenizer.pad(
                inputs,
                padding=True,
                return_tensors="pt",
            )

        scores = []
        for query, doc, *_ in prompts:
            pairs = [(query, doc)]
            inputs = get_inputs(pairs, self.tokenizer)
            inputs = inputs.to(self.model.device)
            logits = self.model(**inputs, return_dict=True).logits
            # Logit of "Yes" at the final position, squashed into (0, 1).
            _scores = (logits[:, -1,
                              self.yes_loc].view(-1, ).float().sigmoid())
            scores.append(_scores[0].item())
        return torch.Tensor(scores)
class GemmaMtebEncoder(VllmMtebEncoder):
    """MTEB encoder that wraps queries and documents in reranker templates.

    Each query becomes ``"A: {query}\\n"`` and each document becomes
    ``"B: {doc}\\n{prompt}"`` before being handed to the parent ``predict``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent and set up the templates."""
        super().__init__(*args, **kwargs)
        self.prompt = PROMPT
        self.query_template = "A: {query}\n"
        self.document_template = "B: {doc}\n{prompt}"

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Apply the templates to every pair and delegate to the parent.

        Args:
            sentences: ``(query, corpus, prompt)`` triples. The third item
                may be ``None``; it is passed through unchanged but is not
                used to build the document text.
            *args: Forwarded to the parent ``predict``.
            **kwargs: Forwarded to the parent ``predict``.

        Returns:
            Whatever the parent ``predict`` returns for the templated pairs.
        """
        _sentences = []
        for query, corpus, prompt in sentences:
            query = self.query_template.format(query=query)
            # Always use the fixed instruction stored on the encoder: the
            # per-sentence prompt is Optional, and formatting None would put
            # the literal text "None" into the model input.
            corpus = self.document_template.format(doc=corpus,
                                                   prompt=self.prompt)
            _sentences.append((query, corpus, prompt))

        return super().predict(_sentences, *args, **kwargs)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo,
                            monkeypatch) -> None:
    """Run the MTEB rerank comparison for the Gemma-based reranker.

    The test sets ``VLLM_USE_V1=0`` and passes overrides that make vLLM load
    the checkpoint as ``GemmaForSequenceClassification``.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")

    assert model_info.architecture == "GemmaForSequenceClassification"

    hf_overrides = {
        "architectures": ["GemmaForSequenceClassification"],
        "classifier_from_token": ["Yes"],
        "method": "no_post_processing",
    }
    vllm_extra_kwargs: dict[str, Any] = {"hf_overrides": hf_overrides}

    mteb_test_rerank_models(
        GemmaRerankerHfRunner,
        vllm_runner,
        model_info,
        vllm_extra_kwargs,
        vllm_mteb_encoder=GemmaMtebEncoder,
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from typing import Optional
import os
import pytest
from vllm.config import PoolerConfig
......@@ -31,8 +31,10 @@ def v1(run_with_both_engines):
# [Decoder-only]
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
marks=[pytest.mark.core_model]),
pytest.param(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param(
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
# CPU v1 doesn't support sliding window
marks=[pytest.mark.core_model]),
# the qwen models interfere with each other (see PR
# https://github.com/vllm-project/vllm/pull/18720).
# To avoid this problem, for now we skip v0 since it will be
......@@ -40,11 +42,13 @@ def v1(run_with_both_engines):
pytest.param(os.path.join(models_path_prefix, "ssmits/Qwen2-7B-Instruct-embed-base"),
marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
marks=[
pytest.mark.core_model, pytest.mark.cpu_model,
pytest.mark.skip_v1
]),
pytest.param(
os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
marks=[
# CPU only supports V1
pytest.mark.core_model,
pytest.mark.skip_v1
]),
pytest.param(os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2"),
marks=[pytest.mark.skip_v1]),
pytest.param(os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
......@@ -66,10 +70,6 @@ def test_models(
model,
monkeypatch,
) -> None:
if model == os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct") and current_platform.is_cpu(
) and os.environ.get("VLLM_USE_V1", "0") == "1":
pytest.skip("CPU V1 doesn't support sliding window")
if model == os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2") and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
......
......@@ -2,10 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import importlib.util
from array import array
import os
import numpy as np
import openai
import pytest
from scipy.spatial.distance import cosine
......@@ -16,9 +15,6 @@ from vllm.config import ModelConfig
from ....utils import RemoteOpenAIServer
from ....utils import models_path_prefix
# GritLM embedding implementation is only supported by XFormers backend.
pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
reason="GritLM requires XFormers")
MODEL_NAME = os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")
MAX_MODEL_LEN = 4000
......@@ -28,11 +24,11 @@ def _arr(arr):
"""
Convert a list of integers to an array of integers.
"""
return array("i", arr)
return np.array(arr)
def test_find_array():
from vllm.model_executor.models.gritlm import GritLMPooler
from vllm.model_executor.models.gritlm import GritLMMeanPool
model_config = ModelConfig(
MODEL_NAME,
......@@ -43,17 +39,19 @@ def test_find_array():
dtype="bfloat16",
seed=0,
)
pooler = GritLMPooler(model_config=model_config)
pooling = GritLMMeanPool(model_config=model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooling._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=3) == -1
assert pooling._find_array(arr, _arr([3, 4, 5]), end_idx=4) == 3
assert pooling._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
pooling._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
def run_llm_encode(
......@@ -126,7 +124,7 @@ def test_gritlm_offline_embedding(vllm_runner):
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
d_rep = run_llm_encode(
llm,
......@@ -173,7 +171,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
......
......@@ -18,11 +18,8 @@ EMBEDDING_MODELS = [
]
RERANK_MODELS = [
RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
architecture="XLMRobertaForSequenceClassification",
dtype="float32",
)
RerankModelInfo("jinaai/jina-reranker-v2-base-multilingual",
architecture="XLMRobertaForSequenceClassification")
]
......@@ -90,10 +87,10 @@ def test_matryoshka(
task="embed",
dtype=dtype,
max_model_len=None) as vllm_model:
assert vllm_model.model.llm_engine.model_config.is_matryoshka
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = (
vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions:
......
......@@ -12,11 +12,9 @@ from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
RERANK_MODELS = [
RerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=True),
RerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
dtype="float32",
enable_test=False)
]
......
......@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed",
max_model_len=None) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
......@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, task="embed",
max_model_len=256) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
# set 512 < max_model_len <= 2048
......@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
else:
with vllm_runner(model_info.name, task="embed",
max_model_len=1024) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
......
......@@ -6,17 +6,16 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.utils import multi_gpu_test
from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
RERANK_MODELS = [
RerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
dtype="float32",
enable_test=True),
RerankModelInfo("Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
dtype="float32",
enable_test=False)
]
......@@ -89,3 +88,29 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
@multi_gpu_test(num_gpus=2)
def test_rerank_models_mteb_tp(vllm_runner,
                               model_info: RerankModelInfo) -> None:
    """Run the MTEB rerank check for Qwen3 rerankers with tensor parallelism.

    The test is parametrized over ``RERANK_MODELS`` and wrapped in
    ``multi_gpu_test(num_gpus=2)``. It forwards ``tensor_parallel_size=2``
    together with the Qwen3 reranker ``hf_overrides`` to
    ``mteb_test_rerank_models`` and passes ``atol=1.2e-2`` to it.

    Args:
        vllm_runner: Runner fixture handed through to
            ``mteb_test_rerank_models``.
        model_info: Model under test; its architecture must be
            ``Qwen3ForSequenceClassification``.
    """
    assert model_info.architecture == "Qwen3ForSequenceClassification"

    # Overrides describing the original Qwen3 reranker checkpoint layout.
    hf_overrides = {
        "architectures": ["Qwen3ForSequenceClassification"],
        "classifier_from_token": ["no", "yes"],
        "is_original_qwen3_reranker": True,
    }
    vllm_extra_kwargs: dict[str, Any] = dict(
        hf_overrides=hf_overrides,
        tensor_parallel_size=2,
    )

    # The 4B variant is limited to one sequence at a time
    # (presumably to bound memory use -- TODO confirm).
    if model_info.name == "Qwen/Qwen3-Reranker-4B":
        vllm_extra_kwargs.update(max_num_seqs=1)

    mteb_test_rerank_models(
        Qwen3RerankerHfRunner,
        vllm_runner,
        model_info,
        vllm_extra_kwargs,
        atol=1.2e-2,
    )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import torch
import torch.nn.functional as F
......@@ -84,6 +86,9 @@ def test_prm_models(
dtype: str,
monkeypatch,
) -> None:
if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
pytest.skip("CPU only supports V1")
if current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment