Commit 6d2051cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

parents 2c7f740a a2c71c54
...@@ -9,12 +9,9 @@ from vllm.lora.request import LoRARequest ...@@ -9,12 +9,9 @@ from vllm.lora.request import LoRARequest
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
prompts = [ prompts: List[str]) -> List[str]:
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate( outputs = llm.generate(
prompts, prompts,
...@@ -33,22 +30,71 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: ...@@ -33,22 +30,71 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
@pytest.mark.parametrize("tp_size", [4]) @pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size): def test_mixtral_lora(mixtral_lora_files, tp_size):
"""Original test, the LoRA model has the common target modules, not all"""
if torch.cuda.device_count() < tp_size: if torch.cuda.device_count() < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(MODEL_PATH, prompts = [
enable_lora=True, "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501
max_num_seqs=16, "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501
max_loras=4, "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501
distributed_executor_backend="ray", ]
tensor_parallel_size=tp_size)
llm = vllm.LLM(
MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
distributed_executor_backend="ray",
tensor_parallel_size=tp_size,
)
expected_lora_output = [ expected_lora_output = [
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501
"give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501 "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])", # noqa: E501
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501
] ]
assert do_sample(llm, mixtral_lora_files, assert do_sample(llm, mixtral_lora_files, lora_id=1,
lora_id=1) == expected_lora_output prompts=prompts) == expected_lora_output
assert do_sample(llm, mixtral_lora_files, assert do_sample(llm, mixtral_lora_files, lora_id=2,
lora_id=2) == expected_lora_output prompts=prompts) == expected_lora_output
@pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
                                         tp_size):
    """Generate with a LoRA that has all supported Mixtral target modules.

    The same adapter files are submitted under two different LoRA ids and
    both runs must reproduce the expected completions exactly. The test is
    skipped when fewer than ``tp_size`` CUDA devices are visible.
    """
    available_gpus = torch.cuda.device_count()
    if available_gpus < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    prompts = [
        "Question: What happens if you touch the eyes of a blind man?\nChoices:\nB: The man's sight will be restored.\nA: Nothing happens if you touch the eyes of a blind man.\nC: The man will get his eyesight back.\nD: The man will no longer be blind.\nAnswer:",  # noqa: E501
        "Question: Which will increase the energy of motion of water molecules?\nChoices:\nA: add heat\nB: add ice\nC: remove some of the water\nD: place the water in the freezer\nAnswer:",  # noqa: E501
        "Since Craig threw aluminum cans in the trash and Benjamin recycled, _ was environmentally irresponsible.\nChoices:\n1: Craig\n2: Benjamin\nAnswer:",  # noqa: E501
    ]
    expected_lora_output = [
        "A: Nothing happens if you touch the eyes of a blind man.",
        "A: add heat",
        "1: Craig",
    ]

    engine_kwargs = {
        "enable_lora": True,
        "max_num_seqs": 16,
        "max_loras": 4,
        "distributed_executor_backend": "ray",
        "tensor_parallel_size": tp_size,
        "max_lora_rank": 32,
    }
    llm = vllm.LLM(MODEL_PATH, **engine_kwargs)

    # Same adapter, two ids: each id must yield the expected text.
    for lora_id in (1, 2):
        generated = do_sample(llm,
                              mixtral_lora_files_all_target_modules,
                              lora_id=lora_id,
                              prompts=prompts)
        assert generated == expected_lora_output
...@@ -71,10 +71,10 @@ def do_sample(llm: vllm.LLM, ...@@ -71,10 +71,10 @@ def do_sample(llm: vllm.LLM,
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, model, tp_size): def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
# Cannot use as it will initialize torch.cuda too early... tp_size):
# if torch.cuda.device_count() < tp_size: if num_gpus_available < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM( llm = vllm.LLM(
model=model.model_path, model=model.model_path,
...@@ -164,11 +164,10 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size): ...@@ -164,11 +164,10 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.skip("Requires multiple GPUs") def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
def test_quant_model_tp_equality(tinyllama_lora_files, model): model):
# Cannot use as it will initialize torch.cuda too early... if num_gpus_available < 2:
# if torch.cuda.device_count() < 2: pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
# pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
llm_tp1 = vllm.LLM( llm_tp1 = vllm.LLM(
model=model.model_path, model=model.model_path,
......
...@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): ...@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
lora_request) lora_request)
def test_get_lora_tokenizer(sql_lora_files, tmpdir): def test_get_lora_tokenizer(sql_lora_files, tmp_path):
lora_request = None lora_request = None
tokenizer = get_lora_tokenizer(lora_request) tokenizer = get_lora_tokenizer(lora_request)
assert not tokenizer assert not tokenizer
...@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir): ...@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
tokenizer = get_lora_tokenizer(lora_request) tokenizer = get_lora_tokenizer(lora_request)
assert tokenizer.get_added_vocab() assert tokenizer.get_added_vocab()
lora_request = LoRARequest("1", 1, str(tmpdir)) lora_request = LoRARequest("1", 1, str(tmp_path))
tokenizer = get_lora_tokenizer(lora_request) tokenizer = get_lora_tokenizer(lora_request)
assert not tokenizer assert not tokenizer
...@@ -185,13 +185,14 @@ def test_metric_spec_decode( ...@@ -185,13 +185,14 @@ def test_metric_spec_decode(
) -> None: ) -> None:
k = 5 k = 5
with vllm_runner(model, with vllm_runner(
dtype=dtype, model,
disable_log_stats=False, dtype=dtype,
gpu_memory_utilization=0.4, disable_log_stats=False,
speculative_model=model, gpu_memory_utilization=0.4,
num_speculative_tokens=k, speculative_model=model,
use_v2_block_manager=True) as vllm_model: num_speculative_tokens=k,
) as vllm_model:
# Force log interval to be 0 to catch all metrics. # Force log interval to be 0 to catch all metrics.
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
...@@ -242,7 +243,6 @@ def test_metric_spec_decode_interval( ...@@ -242,7 +243,6 @@ def test_metric_spec_decode_interval(
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
speculative_model=model, speculative_model=model,
num_speculative_tokens=k, num_speculative_tokens=k,
use_v2_block_manager=True,
enforce_eager=True) enforce_eager=True)
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
...@@ -326,7 +326,6 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, ...@@ -326,7 +326,6 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
"vllm:e2e_request_latency_seconds", "vllm:e2e_request_latency_seconds",
"vllm:request_prompt_tokens", "vllm:request_prompt_tokens",
"vllm:request_generation_tokens", "vllm:request_generation_tokens",
"vllm:request_params_best_of",
"vllm:request_params_n", "vllm:request_params_n",
] ]
for metric_name in request_histogram_metrics: for metric_name in request_histogram_metrics:
......
import pytest
@pytest.fixture
def sample_regex():
    """Provide a regex for a dotted IPv4 address.

    The pattern is three ``<octet>\\.`` groups followed by one final octet;
    the octet alternatives cover the values 0 through 255.
    """
    octet = r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
    return "(" + octet + r"\.){3}" + octet
@pytest.fixture
def sample_json_schema():
    """Provide a JSON schema describing an employee-profile object.

    The object requires ``name``, ``age``, ``skills`` (at least three strings
    of at most ten characters) and ``work_history`` (a list of objects that
    each require ``company`` and ``position``).
    """
    skills_schema = {
        "type": "array",
        "items": {
            "type": "string",
            "maxLength": 10
        },
        "minItems": 3
    }
    job_schema = {
        "type": "object",
        "properties": {
            "company": {
                "type": "string"
            },
            "duration": {
                "type": "number"
            },
            "position": {
                "type": "string"
            }
        },
        "required": ["company", "position"]
    }
    properties = {
        "name": {
            "type": "string"
        },
        "age": {
            "type": "integer"
        },
        "skills": skills_schema,
        "work_history": {
            "type": "array",
            "items": job_schema
        }
    }
    return {
        "type": "object",
        "properties": properties,
        "required": ["name", "age", "skills", "work_history"]
    }
# This unit test should be moved to a new
# tests/test_guided_decoding directory.
import pytest import pytest
import torch import torch
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.entrypoints.openai.protocol import CompletionRequest
from vllm.model_executor.guided_decoding import ( from vllm.model_executor.guided_decoding import (
get_guided_decoding_logits_processor) get_guided_decoding_logits_processor)
from vllm.model_executor.guided_decoding.outlines_logits_processors import ( from vllm.model_executor.guided_decoding.outlines_logits_processors import (
JSONLogitsProcessor, RegexLogitsProcessor) JSONLogitsProcessor, RegexLogitsProcessor)
from vllm.sampling_params import GuidedDecodingParams
def test_guided_logits_processors(sample_regex, sample_json_schema): def test_guided_logits_processors(sample_regex, sample_json_schema):
...@@ -44,11 +42,9 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex, ...@@ -44,11 +42,9 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}") f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = CompletionRequest(model='test', regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
prompt=token_ids,
guided_regex=sample_regex)
regex_lp = await get_guided_decoding_logits_processor( regex_lp = await get_guided_decoding_logits_processor(
backend, regex_request, tokenizer) regex_request, tokenizer)
assert regex_lp is not None assert regex_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
...@@ -59,14 +55,31 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex, ...@@ -59,14 +55,31 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
token_ids = tokenizer.encode( token_ids = tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}" f"Give an employee profile that fits this schema: {sample_json_schema}"
) )
json_request = CompletionRequest(model='test', json_request = GuidedDecodingParams(json=sample_json_schema,
prompt=token_ids, backend=backend)
guided_json=sample_json_schema)
json_lp = await get_guided_decoding_logits_processor( json_lp = await get_guided_decoding_logits_processor(
backend, json_request, tokenizer) json_request, tokenizer)
assert json_lp is not None assert json_lp is not None
tensor = torch.rand(32000) tensor = torch.rand(32000)
original_tensor = torch.clone(tensor) original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor) tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor) assert not torch.allclose(tensor, original_tensor)
def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
    """Combining ``json`` with any other guided option must raise ValueError."""
    conflicting_options = (
        {"regex": sample_regex},
        {"json_object": True},
        {"choice": ["a", "b"]},
        {"grammar": "test grammar"},
    )
    for extra_option in conflicting_options:
        with pytest.raises(ValueError,
                           match="You can only use one kind of guided"):
            GuidedDecodingParams(json=sample_json_schema, **extra_option)
...@@ -19,12 +19,12 @@ MAX_MODEL_LEN = 1024 ...@@ -19,12 +19,12 @@ MAX_MODEL_LEN = 1024
# FIXME: Move this to conftest # FIXME: Move this to conftest
MODELS = [ MODELS = [
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", ("meta-llama/Llama-3.2-1B-Instruct",
hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")), filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", ("meta-llama/Llama-3.2-1B-Instruct",
hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF", hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")), filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
("Qwen/Qwen2-1.5B-Instruct", ("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
filename="qwen2-1_5b-instruct-q4_k_m.gguf")), filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
Run `pytest tests/models/test_granite.py`. Run `pytest tests/models/test_granite.py`.
""" """
import pytest import pytest
import transformers
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
...@@ -12,9 +11,6 @@ MODELS = [ ...@@ -12,9 +11,6 @@ MODELS = [
] ]
# GraniteForCausalLM will be in transformers >= 4.45
@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="granite model test requires transformers >= 4.45")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
......
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
Run `pytest tests/models/test_granite.py`.
"""
import pytest
from ...utils import check_logprobs_close
MODELS = [
"ibm/PowerMoE-3b",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprob outputs of the HF runner and the vLLM runner.

    Both runners receive the same prompts, token budget and number of
    logprobs; the results are handed to ``check_logprobs_close``.
    """
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(*generation_args)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(*generation_args)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
import pytest import pytest
from tests.utils import multi_gpu_test
from vllm.sampling_params import SamplingParams
from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.model_runner import _get_graph_batch_size
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
MODELS = ["ai21labs/Jamba-tiny-random"] MODELS = ["ai21labs/Jamba-tiny-dev"]
# Fails due to usage of MoE as MLP (E=1), which is different from the HF impl
# TODO: Fix this with trained model
@pytest.mark.skip()
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [10]) @pytest.mark.parametrize("max_tokens", [96])
def test_models( def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
...@@ -22,7 +21,14 @@ def test_models( ...@@ -22,7 +21,14 @@ def test_models(
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(
model,
dtype=dtype,
model_kwargs={
"use_mamba_kernels":
False, # mamba kernels are not installed so HF
# don't use them
}) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
...@@ -38,8 +44,8 @@ def test_models( ...@@ -38,8 +44,8 @@ def test_models(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [96])
def test_batching( def test_batching(
vllm_runner, vllm_runner,
example_prompts, example_prompts,
...@@ -65,6 +71,107 @@ def test_batching( ...@@ -65,6 +71,107 @@ def test_batching(
) )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_mamba_prefill_chunking_with_parallel_sampling(
        hf_runner, vllm_runner, example_prompts, model: str, dtype: str,
        max_tokens: int) -> None:
    """Smoke-test chunked prefill together with parallel sampling (n > 1).

    With n > 1 the prefill batch is populated with decoding tokens as well,
    so a single chunked-prefill forward pass holds both prefills and decodes.
    The test only checks that generation completes; a failure may indicate
    that the cache is not allocated correctly for n > 1 decoding steps.
    """
    runner_kwargs = {
        "dtype": dtype,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 30,
        # forces prefill chunks with decoding
        "max_num_seqs": 10,
    }
    sampling_params = SamplingParams(n=3,
                                     temperature=1,
                                     seed=0,
                                     max_tokens=max_tokens)

    with vllm_runner(model, **runner_kwargs) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                model: str, dtype: str,
                                max_tokens: int) -> None:
    """Check that vLLM with chunked prefill matches non-chunked HF output.

    The reference ("non_chunked") comes from the HF runner with mamba kernels
    disabled; the candidate ("chunked") comes from the vLLM runner with
    chunked prefill enabled and very small batch limits. Both are compared
    with ``check_outputs_equal``.
    """
    # Numeric error during prefill chunking produces different generations
    # compared to w/o prefill chunking for these examples, so they are left
    # out for now. Filter into a new list instead of popping so the list
    # object supplied by the fixture is not mutated.
    excluded_indices = {1, 2, 7}
    prompts = [
        prompt for idx, prompt in enumerate(example_prompts)
        if idx not in excluded_indices
    ]

    with hf_runner(
            model,
            dtype=dtype,
            model_kwargs={
                # mamba kernels are not installed, so HF must not use them
                "use_mamba_kernels": False,
            }) as hf_model:
        non_chunked = hf_model.generate_greedy(prompts, max_tokens)

    with vllm_runner(model,
                     dtype=dtype,
                     enable_chunked_prefill=True,
                     max_num_batched_tokens=5,
                     max_num_seqs=2) as vllm_model:
        chunked = vllm_model.generate_greedy(prompts, max_tokens=max_tokens)

    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
        name_1="non_chunked",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare one n=10 sampling request against ten separate greedy runs.

    The parallel request uses a near-zero temperature and a fixed seed; each
    of its samples is expected to equal the greedy output for the same
    prompt.
    """
    # using example_prompts index 1 instead of 0 since with 0 the
    # logprobs get really close and the test doesn't pass
    prompt_batch = [example_prompts[1]]
    num_samples = 10

    with vllm_runner(model, dtype=dtype) as vllm_model:
        for_loop_outputs = [
            vllm_model.generate_greedy(prompt_batch, max_tokens)[0]
            for _ in range(num_samples)
        ]
        sampling_params = SamplingParams(n=num_samples,
                                         temperature=0.001,
                                         seed=0,
                                         max_tokens=max_tokens)
        parallel_result = vllm_model.generate(prompt_batch, sampling_params)

    # Regroup (all token ids, all texts) into one (token ids, text) per sample.
    sampled_ids, sampled_texts = parallel_result[0]
    parallel_outputs = list(zip(sampled_ids, sampled_texts))

    check_outputs_equal(
        outputs_0_lst=parallel_outputs,
        outputs_1_lst=for_loop_outputs,
        name_0="vllm_n_lt_1_outputs",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20]) @pytest.mark.parametrize("max_tokens", [20])
...@@ -164,6 +271,30 @@ def test_state_cleanup( ...@@ -164,6 +271,30 @@ def test_state_cleanup(
"could be related to finished_requests_ids") "could be related to finished_requests_ids")
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
def test_jamba_distributed_produces_identical_generation(
        vllm_runner, model: str, dtype: str, max_tokens: int,
        example_prompts) -> None:
    """Greedy generation must match between tensor-parallel sizes 1 and 2."""
    outputs_by_tp = {}
    # Run the two-way tensor-parallel configuration first, then the
    # single-device one.
    for tp_size in (2, 1):
        with vllm_runner(model, dtype=dtype,
                         tensor_parallel_size=tp_size) as vllm_model:
            outputs_by_tp[tp_size] = vllm_model.generate_greedy(
                example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=outputs_by_tp[1],
        outputs_1_lst=outputs_by_tp[2],
        name_0="vllm_tp_1",
        name_1="vllm_tp_2",
    )
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
def test_model_print( def test_model_print(
......
"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
Run `pytest tests/models/test_mamba.py`.
"""
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.sampling_params import SamplingParams
from vllm.worker.model_runner import _get_graph_batch_size
from ...utils import check_outputs_equal
MODELS = ["state-spaces/mamba-130m-hf"]
# Use lower-level interfaces to create this greedy generator, as mamba will
# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used.
def generate_greedy(model_name, example_prompts, max_tokens):
    """Generate a completion for every prompt with the HF model directly.

    The tokenizer and model are loaded from ``model_name`` and each prompt is
    processed on its own through ``model.generate``.

    NOTE(review): no sampling arguments are passed to ``model.generate``, so
    greedy decoding presumably relies on the model's default generation
    config -- confirm.

    Args:
        model_name: Name or path given to ``from_pretrained``.
        example_prompts: Iterable of prompt strings.
        max_tokens: Passed to ``model.generate`` as ``max_new_tokens``.

    Returns:
        A list with one ``(token_ids, text)`` tuple per prompt, built from the
        first sequence returned by ``model.generate``; ``text`` is decoded
        with special tokens skipped.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    def _generate_one(prompt):
        # Tokenize the prompt (with truncation) and move it to the model.
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
        prompt_ids = encoded["input_ids"].to(model.device)

        sequence = model.generate(prompt_ids, max_new_tokens=max_tokens)[0]
        text = tokenizer.decode(sequence, skip_special_tokens=True)
        return (sequence.tolist(), text)

    return [_generate_one(prompt) for prompt in example_prompts]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_models(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that vLLM greedy output equals the HF reference for each prompt.

    The reference comes from this module's ``generate_greedy`` helper; the
    candidate comes from the vLLM runner. Both the decoded text and the token
    ids must match exactly.
    """
    hf_outputs = generate_greedy(model, example_prompts, max_tokens)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    # Fail with a clear message if either side dropped or added outputs,
    # rather than hitting an IndexError or silently ignoring extras.
    assert len(hf_outputs) == len(vllm_outputs) == len(example_prompts), (
        f"Output count mismatch: prompts={len(example_prompts)}, "
        f"HF={len(hf_outputs)}, vLLM={len(vllm_outputs)}")

    for i, (hf_output, vllm_output) in enumerate(zip(hf_outputs,
                                                     vllm_outputs)):
        hf_output_ids, hf_output_str = hf_output
        vllm_output_ids, vllm_output_str = vllm_output
        assert hf_output_str == vllm_output_str, (
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_batching(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Greedy output per prompt must equal greedy output for the whole batch."""
    # To pass the small model tests, we need full precision.
    with vllm_runner(model, dtype=dtype) as vllm_model:
        # One request per prompt first ...
        for_loop_outputs = [
            vllm_model.generate_greedy([prompt], max_tokens)[0]
            for prompt in example_prompts
        ]
        # ... then every prompt in a single batched request.
        batched_outputs = vllm_model.generate_greedy(example_prompts,
                                                     max_tokens)

    check_outputs_equal(
        outputs_0_lst=for_loop_outputs,
        outputs_1_lst=batched_outputs,
        name_0="for_loop_vllm",
        name_1="batched_vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [10])
def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts,
                                                model: str, dtype: str,
                                                max_tokens: int) -> None:
    """Smoke-test chunked prefill in conjunction with n > 1 sampling.

    In this setup the prefill batch is populated with decoding tokens, so one
    chunked-prefill forward pass contains both prefill and decode work. The
    test only checks that generation does not fail; a failure may indicate
    that the cache is not allocated correctly for n > 1 decoding steps.
    """
    runner_kwargs = {
        "dtype": dtype,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 30,
        # forces prefill chunks with decoding
        "max_num_seqs": 10,
    }
    sampling_params = SamplingParams(n=3,
                                     temperature=1,
                                     seed=0,
                                     max_tokens=max_tokens)

    with vllm_runner(model, **runner_kwargs) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str,
                         max_tokens: int,
                         chunked_prefill_token_size: int) -> None:
    """
    Checks exact match decode between huggingface model and vllm runner with
    chunked prefill.

    ``chunked_prefill_token_size`` is used for both the vLLM
    ``max_num_batched_tokens`` and ``max_num_seqs`` limits.
    """
    # Reference generation from the HF model, without chunking.
    non_chunked = generate_greedy(model, example_prompts, max_tokens)

    runner_kwargs = {
        "dtype": dtype,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": chunked_prefill_token_size,
        "max_num_seqs": chunked_prefill_token_size,
    }
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        chunked = vllm_model.generate_greedy(example_prompts,
                                             max_tokens=max_tokens)

    check_outputs_equal(
        outputs_0_lst=chunked,
        outputs_1_lst=non_chunked,
        name_0="chunked",
        name_1="non_chunked",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Compare ten sequential greedy runs against one n=10 sampling request.

    The n=10 request uses a near-zero temperature and a fixed seed, and its
    ten samples are expected to equal the ten individually generated greedy
    outputs.
    """
    with vllm_runner(model, dtype=dtype) as engine:
        # using example_prompts index 1 instead of 0 since with 0 the
        # logprobs get really close and the test doesn't pass
        sequential_outputs = [
            engine.generate_greedy([example_prompts[1]], max_tokens)[0]
            for _ in range(10)
        ]

        parallel_params = SamplingParams(
            n=10,
            temperature=0.001,
            seed=0,
            max_tokens=max_tokens,
        )
        parallel_result = engine.generate([example_prompts[1]],
                                          parallel_params)

    # The single request yields parallel lists of token ids and texts;
    # pair them up so each sample has the same (token_ids, text) layout as
    # a greedy output.
    sample_token_ids, sample_texts = parallel_result[0]
    parallel_outputs = list(zip(sample_token_ids, sample_texts))

    check_outputs_equal(
        outputs_0_lst=parallel_outputs,
        outputs_1_lst=sequential_outputs,
        name_0="vllm_n_lt_1_outputs",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20])
def test_mamba_cache_cg_padding(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Verify generation works for a batch size that is not a captured one.

    The mamba cache must be padded to the CUDA-graph captured batch size.
    If it is not, torch raises a RuntimeError because tensor dimensions are
    incompatible, and that error is reported as a test failure.
    """
    # Grow the prompt list (in place) until its length differs from the
    # batch size returned by _get_graph_batch_size for that length.
    while _get_graph_batch_size(len(example_prompts)) == len(example_prompts):
        example_prompts.append(example_prompts[0])

    try:
        with vllm_runner(model, dtype=dtype) as engine:
            engine.generate_greedy(example_prompts, max_tokens)
    except RuntimeError:
        pytest.fail(
            "Couldn't run batch size which is not equal to a Cuda Graph "
            "captured batch size. "
            "Could be related to mamba cache not padded correctly")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [20])
def test_models_preemption_recompute(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check outputs are identical with and without preemption (recompute).

    The same runner generates twice: first with the scheduler's
    ``ENABLE_ARTIFICIAL_PREEMPT`` flag switched on, then with it switched
    off, and the two result sets are compared.
    """
    assert dtype == "float"

    with vllm_runner(model, dtype=dtype) as engine:
        scheduler = engine.model.llm_engine.scheduler[0]

        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
        outputs_with_preemption = engine.generate_greedy(
            example_prompts, max_tokens)

        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
        outputs_without_preemption = engine.generate_greedy(
            example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=outputs_with_preemption,
        outputs_1_lst=outputs_without_preemption,
        name_0="vllm_preepmtions",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Verify Mamba inner state management survives request over-subscription.

    Submits 100 copies of the first example prompt to a runner limited to
    ``max_num_seqs=10``. A ``ValueError`` raised during generation is
    reported as a test failure.
    """
    # This test is for verifying that the Mamba inner state management doesn't
    # collapse in case where the number of incoming requests and
    # finished_requests_ids is larger than the maximum Mamba block capacity.
    # This could generally happen due to the fact that Mamba does support
    # statelessness mechanism where it can cleanup new incoming requests in
    # a single step.
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
        # Adjacent string literals are concatenated verbatim, so the first
        # fragment must end with a space to avoid rendering "betweensteps".
        pytest.fail("Mamba inner state wasn't cleaned up properly between "
                    "steps finished requests registered unnecessarily ")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_state_cleanup(
    vllm_runner,
    model: str,
    dtype: str,
    example_prompts,
) -> None:
    """Verify that the Mamba state is cleaned up between steps.

    Runs ten rounds of single-token generation over 100 copies of the first
    example prompt on one runner. If the state is not cleaned up an error is
    expected; a ``ValueError`` is turned into a test failure.
    """
    rounds = 10
    try:
        with vllm_runner(model, dtype=dtype) as engine:
            for _ in range(rounds):
                repeated_prompts = [example_prompts[0]] * 100
                engine.generate_greedy(repeated_prompts, 1)
    except ValueError:
        pytest.fail("Mamba inner state wasn't cleaned up between states, "
                    "could be related to finished_requests_ids")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Verify that the model's extra_repr can be printed correctly."""
    with vllm_runner(model, dtype=dtype) as engine:
        # Walk down from the engine to the model object held by the driver
        # worker's model runner, then print it.
        driver_worker = engine.model.llm_engine.model_executor.driver_worker
        loaded_model = driver_worker.model_runner.model
        print(loaded_model)
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from vllm.utils import is_cpu from vllm.utils import is_cpu
from ....utils import large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
MODELS = [ MODELS = [
...@@ -69,20 +70,10 @@ def test_phimoe_routing_function(): ...@@ -69,20 +70,10 @@ def test_phimoe_routing_function():
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
def get_gpu_memory():
    """Return the current CUDA device's total memory in GiB, or 0 on error.

    Any exception raised while querying the device is swallowed on purpose
    so the function can be evaluated at import time inside a skip condition.
    """
    bytes_per_gib = 1024**3
    try:
        device_index = torch.cuda.current_device()
        total_bytes = torch.cuda.get_device_properties(
            device_index).total_memory
        return total_bytes / bytes_per_gib
    except Exception:
        return 0
@pytest.mark.skipif(condition=is_cpu(), @pytest.mark.skipif(condition=is_cpu(),
reason="This test takes a lot time to run on CPU, " reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.") "and vllm CI's disk space is not enough for this model.")
@pytest.mark.skipif(condition=get_gpu_memory() < 100, @large_gpu_test(min_gb=80)
reason="Skip this test if GPU memory is insufficient.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
......
...@@ -65,8 +65,8 @@ def run_test( ...@@ -65,8 +65,8 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
max_model_len=2560, max_model_len=2048,
max_num_seqs=1, max_num_seqs=2,
dtype=dtype, dtype=dtype,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
...@@ -80,8 +80,6 @@ def run_test( ...@@ -80,8 +80,6 @@ def run_test(
] ]
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
eos_token_id = hf_model.processor.tokenizer.eos_token_id eos_token_id = hf_model.processor.tokenizer.eos_token_id
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
......
from typing import List, Optional, Tuple, Type
import pytest
from vllm.multimodal.utils import rescale_image_size
from vllm.transformers_utils.tokenizer import patch_padding_side
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
# Prompts built by `IMAGE_ASSETS.prompts` from a mapping of image asset name
# to the question asked about that image.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "What's the content of the image?",
    "cherry_blossom":
    "What is the season?",
})
# Model identifiers that `test_models` is parametrized over.
models = ["THUDM/glm-4v-9b"]
# dtype string that `test_models` is parametrized with.
target_dtype = "bfloat16"
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Generate with vLLM and with HF, then check the logprobs are close.

    Args:
        hf_runner: Factory/context manager producing the HF reference model.
        vllm_runner: Factory/context manager producing the vLLM model.
        inputs: One ``(prompts, images)`` pair per test case.
        model: Model identifier passed to both runners.
        dtype: dtype string passed to both runners.
        max_tokens: Generation length passed to both generate calls.
        num_logprobs: Number of logprobs requested from both runners.
        mm_limit: Per-prompt image limit given to the vLLM runner.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    # max_model_len should be greater than image_feature_size
    vllm_options = dict(
        max_model_len=2048,
        max_num_seqs=2,
        dtype=dtype,
        limit_mm_per_prompt={"image": mm_limit},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    with vllm_runner(model, **vllm_options) as vllm_model:
        stop_token_ids = [151329, 151336, 151338]
        vllm_outputs_per_image = []
        for prompts, images in inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    stop_token_ids=stop_token_ids,
                ))

    with hf_runner(model, dtype=dtype) as hf_model:
        original_processor = hf_model.processor
        patch_padding_side(original_processor)

        def chat_template_processor(*args, text="", images=None, **kwargs):
            # Image inputs go through the chat template as a single user
            # message; everything else is delegated to the original
            # processor.
            if images is not None:
                user_message = {
                    "role": "user",
                    "image": images,
                    "content": text
                }
                return original_processor.apply_chat_template(
                    [user_message],
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    **kwargs,
                )
            # NOTE(review): `text` is not forwarded on this path - confirm
            # this is intended.
            return original_processor(*args, **kwargs)

        def output_embeddings():
            # Resolved lazily at call time, as with the attribute lookup
            # chain it replaces.
            return hf_model.model.transformer.output_layer

        hf_model.processor = chat_template_processor
        hf_model.model.get_output_embeddings = output_embeddings

        hf_outputs_per_image = []
        for prompts, images in inputs:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                ))

    paired_outputs = zip(hf_outputs_per_image, vllm_outputs_per_image)
    for hf_outputs, vllm_outputs in paired_outputs:
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Run the HF-vs-vLLM comparison over rescaled copies of each image.

    For every (image asset, prompt) pair, the prompt is repeated once per
    size factor and the image is rescaled by each factor; the resulting
    cases are handed to ``run_test``.
    """
    inputs_per_image = []
    pil_images = [asset.pil_image for asset in image_assets]
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(pil_image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, scaled_images))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
...@@ -97,7 +97,8 @@ def run_test( ...@@ -97,7 +97,8 @@ def run_test(
self.tokenizer = hf_runner.tokenizer self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype self.dtype = hf_runner.model.dtype
self.config = AutoConfig.from_pretrained(hf_runner.model_name) self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.vision_config = self.config.vision_config self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch self.min_num = self.config.min_dynamic_patch
......
from typing import List, Optional, Tuple, Type, overload from typing import List, Optional, Tuple, Type, overload
import pytest import pytest
import transformers
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import (rescale_video_size, resize_video, from vllm.multimodal.utils import (rescale_video_size, resize_video,
...@@ -158,8 +157,6 @@ def run_test( ...@@ -158,8 +157,6 @@ def run_test(
) )
@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"size_factors", "size_factors",
...@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ...@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
) )
@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"sizes", "sizes",
......
from typing import List, Optional, Tuple, Type, overload from typing import List, Optional, Tuple, Type, overload
import pytest import pytest
import transformers
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding) BatchEncoding)
...@@ -12,13 +11,13 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE ...@@ -12,13 +11,13 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_VideoAssets) _VideoAssets)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
# Video test # Video test
HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"sample_demo_1": "sample_demo_1":
"<|im_start|>user <video>\nwhy is this video funny? \ "<|im_start|>user\n<video>\nwhy is this video funny?<|im_end|>\n<|im_start|>assistant\n" # noqa: E501
<|im_end|><|im_start|>assistant\n"
}) })
models = ["llava-hf/llava-onevision-qwen2-7b-ov-hf"] models = ["llava-hf/llava-onevision-qwen2-7b-ov-hf"]
...@@ -166,8 +165,7 @@ def run_video_test( ...@@ -166,8 +165,7 @@ def run_video_test(
) )
@pytest.mark.skipif(transformers.__version__ < "4.45", @large_gpu_test(min_gb=48)
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"size_factors", "size_factors",
...@@ -211,8 +209,7 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ...@@ -211,8 +209,7 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
) )
@pytest.mark.skipif(transformers.__version__ < "4.45", @large_gpu_test(min_gb=48)
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"sizes", "sizes",
...@@ -259,7 +256,8 @@ def run_image_test( ...@@ -259,7 +256,8 @@ def run_image_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
dtype=dtype, dtype=dtype,
max_model_len=32768, max_model_len=16384,
max_num_seqs=2,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=True, enforce_eager=True,
...@@ -305,8 +303,7 @@ def run_image_test( ...@@ -305,8 +303,7 @@ def run_image_test(
) )
@pytest.mark.skipif(transformers.__version__ < "4.45", @large_gpu_test(min_gb=48)
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
...@@ -319,14 +316,10 @@ def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, ...@@ -319,14 +316,10 @@ def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
inputs = [( inputs = [(
[ [
"<|im_start|>user <image><image>\nDescribe 2 images. \ "<|im_start|>user\n<image><image>\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
<|im_end|><|im_start|>assistant\n", "<|im_start|>user\n<image><image>\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
"<|im_start|>user <image><image>\nDescribe 2 images. \ "<|im_start|>user\n<image><image><image><image>\nDescribe 4 images.<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
<|im_end|><|im_start|>assistant\n", "<|im_start|>user\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
"<|im_start|>user <image><image><image><image>\nDescribe 4 images. \
<|im_end|><|im_start|>assistant\n",
"<|im_start|>user <image>\nWhat is the season? \
<|im_end|><|im_start|>assistant\n",
], ],
[ [
[stop_sign, cherry_blossom], [stop_sign, cherry_blossom],
......
...@@ -79,7 +79,7 @@ def run_test( ...@@ -79,7 +79,7 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
max_model_len=4096, max_model_len=4096,
max_num_seqs=1, max_num_seqs=2,
dtype=dtype, dtype=dtype,
limit_mm_per_prompt={"image": mm_limit}, limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
......
import os import os
import re import re
from typing import Callable, List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
import torch import torch
from transformers import AutoImageProcessor, AutoTokenizer from transformers import AutoImageProcessor, AutoTokenizer
from vllm.inputs import InputContext, LLMInputs from vllm.inputs import InputContext, token_inputs
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
...@@ -90,7 +90,7 @@ def run_test( ...@@ -90,7 +90,7 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
max_model_len=4096, max_model_len=4096,
max_num_seqs=1, max_num_seqs=2,
dtype=dtype, dtype=dtype,
limit_mm_per_prompt={"image": mm_limit}, limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
...@@ -311,7 +311,7 @@ def test_input_mapper_override(model: str, image_assets: _ImageAssets, ...@@ -311,7 +311,7 @@ def test_input_mapper_override(model: str, image_assets: _ImageAssets,
(4, 781), (4, 781),
(16, 2653), (16, 2653),
]) ])
def test_max_tokens_override(get_max_phi3v_image_tokens: Callable, model: str, def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
num_crops: int, expected_max_tokens: int): num_crops: int, expected_max_tokens: int):
"""Ensure get_max_phi3v_image_tokens handles num_crops properly.""" """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
# NOTE: mm_processor_kwargs on the context in this test is unused, since # NOTE: mm_processor_kwargs on the context in this test is unused, since
...@@ -343,8 +343,8 @@ def test_max_tokens_override(get_max_phi3v_image_tokens: Callable, model: str, ...@@ -343,8 +343,8 @@ def test_max_tokens_override(get_max_phi3v_image_tokens: Callable, model: str,
(16, 2653, 1), (16, 2653, 1),
(16, 2653, 2), (16, 2653, 2),
]) ])
def test_dummy_data_override(dummy_data_for_phi3v: Callable, model: str, def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
num_crops: int, toks_per_img: int, num_imgs: int): toks_per_img: int, num_imgs: int):
"""Ensure dummy_data_for_phi3v handles num_crops properly.""" """Ensure dummy_data_for_phi3v handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs # Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by # in this test and assume that the kwargs will be correctly expanded by
...@@ -374,7 +374,7 @@ def test_dummy_data_override(dummy_data_for_phi3v: Callable, model: str, ...@@ -374,7 +374,7 @@ def test_dummy_data_override(dummy_data_for_phi3v: Callable, model: str,
(16, 1921, 1), (16, 1921, 1),
(16, 1921, 2), (16, 1921, 2),
]) ])
def test_input_processor_override(input_processor_for_phi3v: Callable, def test_input_processor_override(input_processor_for_phi3v,
image_assets: _ImageAssets, model: str, image_assets: _ImageAssets, model: str,
num_crops: int, expected_toks_per_img: int, num_crops: int, expected_toks_per_img: int,
num_imgs: int): num_imgs: int):
...@@ -393,16 +393,14 @@ def test_input_processor_override(input_processor_for_phi3v: Callable, ...@@ -393,16 +393,14 @@ def test_input_processor_override(input_processor_for_phi3v: Callable,
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
images = [image_assets[0].pil_image] * num_imgs images = [image_assets[0].pil_image] * num_imgs
llm_inputs = LLMInputs(prompt_token_ids=tokenizer.encode(prompt), inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
prompt=prompt, prompt=prompt,
multi_modal_data={"image": images}) multi_modal_data={"image": images})
proc_llm_inputs = input_processor_for_phi3v( processed_inputs = input_processor_for_phi3v(ctx,
ctx=ctx, inputs,
llm_inputs=llm_inputs, num_crops=num_crops)
num_crops=num_crops,
)
# Ensure we have the right number of placeholders per num_crops size # Ensure we have the right number of placeholders per num_crops size
img_tok_count = proc_llm_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
assert img_tok_count == expected_toks_per_img * num_imgs assert img_tok_count == expected_toks_per_img * num_imgs
...@@ -17,7 +17,7 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt ...@@ -17,7 +17,7 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins from vllm.multimodal import MultiModalDataBuiltins
from vllm.sequence import Logprob, SampleLogprobs from vllm.sequence import Logprob, SampleLogprobs
from ....utils import VLLM_PATH from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -121,10 +121,7 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: ...@@ -121,10 +121,7 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
for tokens, text, logprobs in json_data] for tokens, text, logprobs in json_data]
@pytest.mark.skip( @large_gpu_test(min_gb=80)
reason=
"Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN) @pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
...@@ -157,10 +154,7 @@ def test_chat( ...@@ -157,10 +154,7 @@ def test_chat(
name_1="output") name_1="output")
@pytest.mark.skip( @large_gpu_test(min_gb=80)
reason=
"Model is too big, test passed on A100 locally but will OOM on CI machine."
)
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None: def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment