Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
...@@ -15,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", os.path.join(models_path_prefix, "BAAI ...@@ -15,7 +15,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", os.path.join(models_path_prefix, "BAAI
REVISION = os.environ.get("REVISION", "main") REVISION = os.environ.get("REVISION", "main")
MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME",
os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")) os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"))
REVISION_ROBERTA = os.environ.get("REVISION", "main") REVISION_ROBERTA = os.environ.get("REVISION", "main")
...@@ -84,7 +84,7 @@ def test_roberta_model_loading_with_params(vllm_runner): ...@@ -84,7 +84,7 @@ def test_roberta_model_loading_with_params(vllm_runner):
assert model_config.pooler_config.pooling_norm assert model_config.pooler_config.pooling_norm
# asserts on the tokenizer loaded # asserts on the tokenizer loaded
assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-large") assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
assert not model_tokenizer.tokenizer_config["do_lower_case"] assert not model_tokenizer.tokenizer_config["do_lower_case"]
def check_model(model): def check_model(model):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Tuple, Type from typing import Optional
import numpy as np import numpy as np
import pytest import pytest
import os import os
import pytest_asyncio import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding from transformers import AutoModel, AutoTokenizer
from vllm.multimodal.audio import resample_audio from vllm.multimodal.audio import resample_audio
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
...@@ -20,7 +19,7 @@ from ...utils import check_logprobs_close ...@@ -20,7 +19,7 @@ from ...utils import check_logprobs_close
MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b") MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
AudioTuple = Tuple[np.ndarray, int] AudioTuple = tuple[np.ndarray, int]
VLLM_PLACEHOLDER = "<|audio|>" VLLM_PLACEHOLDER = "<|audio|>"
HF_PLACEHOLDER = "<|audio|>" HF_PLACEHOLDER = "<|audio|>"
...@@ -81,7 +80,7 @@ def _get_prompt(audio_count, question, placeholder): ...@@ -81,7 +80,7 @@ def _get_prompt(audio_count, question, placeholder):
add_generation_prompt=True) add_generation_prompt=True)
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]], Optional[SampleLogprobs]],
model: str): model: str):
"""Sanitize vllm output to be comparable with hf output.""" """Sanitize vllm output to be comparable with hf output."""
...@@ -99,9 +98,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -99,9 +98,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
def run_test( def run_test(
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
prompts_and_audios: List[Tuple[str, str, AudioTuple]], prompts_and_audios: list[tuple[str, str, AudioTuple]],
model: str, model: str,
*, *,
dtype: str, dtype: str,
...@@ -110,8 +109,6 @@ def run_test( ...@@ -110,8 +109,6 @@ def run_test(
**kwargs, **kwargs,
): ):
"""Inference result should be the same between hf and vllm.""" """Inference result should be the same between hf and vllm."""
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
...@@ -127,15 +124,7 @@ def run_test( ...@@ -127,15 +124,7 @@ def run_test(
for vllm_prompt, _, audio in prompts_and_audios for vllm_prompt, _, audio in prompts_and_audios
] ]
def process(hf_inputs: BatchEncoding, **kwargs): with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
.to(torch_dtype) # type: ignore
return hf_inputs
with hf_runner(model,
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
hf_outputs_per_audio = [ hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit( hf_model.generate_greedy_logprobs_limit(
[hf_prompt], [hf_prompt],
...@@ -161,8 +150,8 @@ def run_test( ...@@ -161,8 +150,8 @@ def run_test(
def run_multi_audio_test( def run_multi_audio_test(
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
prompts_and_audios: List[Tuple[str, List[AudioTuple]]], prompts_and_audios: list[tuple[str, list[AudioTuple]]],
model: str, model: str,
*, *,
dtype: str, dtype: str,
...@@ -190,7 +179,7 @@ def run_multi_audio_test( ...@@ -190,7 +179,7 @@ def run_multi_audio_test(
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [ @pytest.mark.parametrize("vllm_kwargs", [
......
...@@ -11,12 +11,12 @@ import pytest ...@@ -11,12 +11,12 @@ import pytest
from tests.kernels.utils import override_backend_env_variable from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true"
@pytest.mark.quant_model @pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
...@@ -55,42 +55,107 @@ def test_models( ...@@ -55,42 +55,107 @@ def test_models(
backend: str, backend: str,
tensor_parallel_size: int, tensor_parallel_size: int,
disable_async_output_proc: bool, disable_async_output_proc: bool,
monkeypatch, monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
m.setenv(STR_BACKEND_ENV_VAR, backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner(
test_model,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
outputs_0_lst=baseline_outputs,
outputs_1_lst=test_outputs,
name_0="fp16_kv_cache",
name_1="fp8_kv_cache",
)
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(),
reason="test for the CPU backend.")
@pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model",
[
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct"),
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
# Due to low-precision numerical divergence, this test is too sensitive for
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_cpu_models(
vllm_runner,
example_prompts,
kv_cache_dtype: str,
base_model: str,
test_model: str,
max_tokens: int,
disable_async_output_proc: bool,
monkeypatch: pytest.MonkeyPatch,
) -> None: ) -> None:
""" """
Only checks log probs match to cover the discrepancy in Only checks log probs match to cover the discrepancy in
numerical sensitive kernels. numerical sensitive kernels.
""" """
override_backend_env_variable(monkeypatch, backend) with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", 'true')
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8 MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
with vllm_runner(
base_model, with vllm_runner(
max_model_len=MAX_MODEL_LEN, base_model,
tensor_parallel_size=tensor_parallel_size, max_model_len=MAX_MODEL_LEN,
enforce_eager=enforce_eager, dtype="bfloat16",
kv_cache_dtype="auto", kv_cache_dtype="auto",
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs( baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
with vllm_runner( with vllm_runner(
test_model, test_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tensor_parallel_size, dtype="bfloat16",
enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype,
kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc,
disable_async_output_proc=disable_async_output_proc, ) as vllm_model:
) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs(
test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS)
example_prompts, max_tokens, NUM_LOG_PROBS)
check_logprobs_close(
check_logprobs_close( outputs_0_lst=baseline_outputs,
outputs_0_lst=baseline_outputs, outputs_1_lst=test_outputs,
outputs_1_lst=test_outputs, name_0="bf16_kv_cache",
name_0="fp16_kv_cache", name_1="fp8_kv_cache",
name_1="fp8_kv_cache", )
)
...@@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used ...@@ -5,7 +5,7 @@ Note: To pass the test, quantization higher than Q4 should be used
""" """
import os import os
from typing import List, NamedTuple, Type from typing import NamedTuple
import pytest import pytest
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
...@@ -91,8 +91,8 @@ MODELS = [ ...@@ -91,8 +91,8 @@ MODELS = [
@pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("tp_size", [1, 2])
def test_models( def test_models(
num_gpus_available: int, num_gpus_available: int,
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
example_prompts: List[str], example_prompts: list[str],
model: GGUFTestConfig, model: GGUFTestConfig,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
...@@ -111,16 +111,6 @@ def test_models( ...@@ -111,16 +111,6 @@ def test_models(
example_prompts = tokenizer.apply_chat_template( example_prompts = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True) messages, tokenize=False, add_generation_prompt=True)
# Run unquantized model.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
# Run gguf model. # Run gguf model.
with vllm_runner(model_name=model.gguf_model, with vllm_runner(model_name=model.gguf_model,
enforce_eager=True, enforce_eager=True,
...@@ -131,6 +121,16 @@ def test_models( ...@@ -131,6 +121,16 @@ def test_models(
gguf_outputs = gguf_model.generate_greedy_logprobs( gguf_outputs = gguf_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs) example_prompts[:-1], max_tokens, num_logprobs)
# Run unquantized model.
with vllm_runner(
model_name=model.original_model,
enforce_eager=True, # faster tests
dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:
original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=original_outputs, outputs_0_lst=original_outputs,
outputs_1_lst=gguf_outputs, outputs_1_lst=gguf_outputs,
......
...@@ -11,7 +11,9 @@ from ...utils import check_outputs_equal ...@@ -11,7 +11,9 @@ from ...utils import check_outputs_equal
from ....utils import models_path_prefix from ....utils import models_path_prefix
# This test is for the hybrid models # This test is for the hybrid models
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B")] MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "Zyphra/Zamba2-1.2B-instruct")]
# Bamba at Fp32 is too big for the CI (L4 GPU).
# MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -27,29 +29,24 @@ def test_models( ...@@ -27,29 +29,24 @@ def test_models(
) -> None: ) -> None:
# numeric error produces different generation # numeric error produces different generation
if 'Bamba' in model: if "Bamba" in model:
example_prompts.pop(3) example_prompts.pop(3)
with hf_runner( model_kwargs = {
model, "use_mamba_kernels": False, # mamba kernels are not installed so HF
dtype=dtype, # don't use them
model_kwargs={ }
"use_mamba_kernels": if "Zamba2" in model:
False, # mamba kernels are not installed so HF # Zamba2 HF implementation automatically checks if mamba kernels are
# don't use them # installed
}) as hf_model: model_kwargs = {}
with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i]
...@@ -119,26 +116,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling( ...@@ -119,26 +116,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
model: str, dtype: str, model: str, dtype: str,
max_tokens: int) -> None: max_tokens: int) -> None:
# numeric error during prefill chucking produces different generation # numeric error during prefill chunking produces different generation
# compared to w/o prefill chunking for those examples, removed them for now # compared to w/o prefill chunking for those examples, removed them for now
if 'Jamba' in model: if "Jamba" in model:
example_prompts.pop(7) example_prompts.pop(7)
example_prompts.pop(2) example_prompts.pop(2)
example_prompts.pop(1) example_prompts.pop(1)
elif 'Bamba' in model: elif "Bamba" in model:
example_prompts.pop(6) example_prompts.pop(6)
example_prompts.pop(3) example_prompts.pop(3)
example_prompts.pop(2) example_prompts.pop(2)
dtype = "half" # use a different dtype for Bamba dtype = "half" # use a different dtype for Bamba
elif "Zamba2" in model:
with hf_runner( example_prompts.pop(7)
model, dtype = "half"
dtype=dtype,
model_kwargs={ model_kwargs = {
"use_mamba_kernels": "use_mamba_kernels": False, # mamba kernels are not installed so HF
False, # mamba kernels are not installed so HF # don't use them
# don't use them }
}) as hf_model: if "Zamba2" in model:
# Zamba2 HF implementation automatically checks if mamba kernels are
# installed
model_kwargs = {}
with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
non_chunked = hf_model.generate_greedy(example_prompts, max_tokens) non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model, with vllm_runner(model,
...@@ -194,6 +196,7 @@ def test_parallel_sampling( ...@@ -194,6 +196,7 @@ def test_parallel_sampling(
) )
@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [20]) @pytest.mark.parametrize("max_tokens", [20])
...@@ -295,6 +298,7 @@ def test_state_cleanup( ...@@ -295,6 +298,7 @@ def test_state_cleanup(
"could be related to finished_requests_ids") "could be related to finished_requests_ids")
@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
def test_multistep( def test_multistep(
...@@ -310,6 +314,7 @@ def test_multistep( ...@@ -310,6 +314,7 @@ def test_multistep(
vllm_model.generate_greedy([example_prompts[0]] * 10, 1) vllm_model.generate_greedy([example_prompts[0]] * 10, 1)
@pytest.mark.skip(reason="RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
......
...@@ -70,13 +70,6 @@ def test_models( ...@@ -70,13 +70,6 @@ def test_models(
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model: with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
for i in range(len(example_prompts)): for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i] hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i] vllm_output_ids, vllm_output_str = vllm_outputs[i]
......
...@@ -203,6 +203,7 @@ def test_models( ...@@ -203,6 +203,7 @@ def test_models(
) )
@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
...@@ -215,16 +216,6 @@ def test_mistral_format( ...@@ -215,16 +216,6 @@ def test_mistral_format(
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
) -> None: ) -> None:
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
...@@ -235,6 +226,16 @@ def test_mistral_format( ...@@ -235,6 +226,16 @@ def test_mistral_format(
mistral_format_outputs = mistral_format_model.generate_greedy_logprobs( mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="auto",
load_format="safetensors",
config_format="hf",
) as hf_format_model:
hf_format_outputs = hf_format_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_format_outputs, outputs_0_lst=hf_format_outputs,
outputs_1_lst=mistral_format_outputs, outputs_1_lst=mistral_format_outputs,
...@@ -263,6 +264,7 @@ def test_mistral_symbolic_languages( ...@@ -263,6 +264,7 @@ def test_mistral_symbolic_languages(
assert "�" not in outputs[0].outputs[0].text.strip() assert "�" not in outputs[0].outputs[0].text.strip()
@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("model", @pytest.mark.parametrize("model",
MISTRAL_FORMAT_MODELS) # v1 can't do func calling MISTRAL_FORMAT_MODELS) # v1 can't do func calling
......
...@@ -5,7 +5,6 @@ ...@@ -5,7 +5,6 @@
Note: these tests will only pass on H100 Note: these tests will only pass on H100
""" """
import os import os
from typing import List
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -66,7 +65,7 @@ def test_models(example_prompts, model_name) -> None: ...@@ -66,7 +65,7 @@ def test_models(example_prompts, model_name) -> None:
for prompt in example_prompts for prompt in example_prompts
] ]
params = SamplingParams(max_tokens=20, temperature=0) params = SamplingParams(max_tokens=20, temperature=0)
generations: List[str] = [] generations: list[str] = []
# Note: these need to be run 1 at a time due to numerical precision, # Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way. # since the expected strs were generated this way.
for prompt in formatted_prompts: for prompt in formatted_prompts:
......
...@@ -3,13 +3,37 @@ ...@@ -3,13 +3,37 @@
Run `pytest tests/models/test_models.py`. Run `pytest tests/models/test_models.py`.
""" """
import pytest import pytest
import os import os
import torch
from vllm.platforms import current_platform
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix from ....utils import models_path_prefix
# These have unsupported head_dim for FA. We do not
# have a clean way to fall back, so we fail with
# a clear msg when it happens.
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
# This list contains the models that use AITER kernels.
# Models that are not in this list are skipped in the AITER tests.
# When more AITER kernels are added, this list will no longer be
# needed, as all models will call AITER kernels
# in some of their operators.
AITER_MODEL_LIST = [
"meta-llama/Llama-3.2-1B-Instruct",
"openbmb/MiniCPM3-4B",
"Qwen/Qwen-7B",
"Qwen/Qwen2.5-0.5B-Instruct",
"ehristoforu/Falcon3-MoE-2x7B-Insruct",
]
# @maybe_test_rocm_aiter
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", "model",
[ [
...@@ -65,15 +89,23 @@ from ....utils import models_path_prefix ...@@ -65,15 +89,23 @@ from ....utils import models_path_prefix
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
def test_models( @pytest.mark.parametrize(
hf_runner, "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
vllm_runner, def test_models(hf_runner, vllm_runner, example_prompts, model: str,
example_prompts, dtype: str, max_tokens: int, num_logprobs: int,
model: str, use_rocm_aiter: bool, monkeypatch) -> None:
dtype: str,
max_tokens: int, if model in REQUIRES_V0:
num_logprobs: int, monkeypatch.setenv("VLLM_USE_V1", "0")
) -> None:
if use_rocm_aiter and (model in AITER_MODEL_LIST):
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
# Skip model that are not using AITER tests.
# When more AITER kernels are added, this list will not be
# needed as all the models will be calling AITER kernels
# in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
if model.startswith("THUDM/chatglm3"): if model.startswith("THUDM/chatglm3"):
...@@ -87,16 +119,16 @@ def test_models( ...@@ -87,16 +119,16 @@ def test_models(
vllm_outputs = vllm_model.generate_greedy_logprobs( vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
def print_model(model):
print(model)
vllm_model.apply_model(print_model)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
name_0="hf", name_0="hf",
name_1="vllm", name_1="vllm",
) )
if use_rocm_aiter:
# this is to ensure that vllm engine
# has deallocated the memory before running the next
# unit tests. On ROCm, when using AITER
# the memory might not be deallocated completely
# before running the next test case
torch.cuda.synchronize()
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
"""Tests Model Optimizer nvfp4 models against ground truth generation
Note: these tests will only pass on B200
"""
import os
from typing import List
import pytest
from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
# Set TOKENIZERS_PARALLELISM for the whole process as soon as this module is
# imported.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Value passed as `max_model_len` when test_models constructs the LLM.
MAX_MODEL_LEN = 1024
# Model names that test_models is parametrized over.
MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]
# Golden outputs: maps each model name to the list of expected generated
# strings. test_models compares entry i against the generation for
# example_prompts[i], so the order must follow the `example_prompts` fixture
# (NOTE(review): fixture contents are not visible here -- confirm the order).
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.3-70B-Instruct-FP4": [
'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
]
}
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp4 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
                    reason="nvfp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Check nvfp4 generations against the golden strings for `model_name`.

    Each entry of `example_prompts` is wrapped as a single user message with
    the model's chat template, generated greedily (temperature=0, at most 20
    new tokens), and compared for exact equality with the entry at the same
    index in ``EXPECTED_STRS_MAP[model_name]``.

    Args:
        example_prompts: Prompt strings supplied by the test fixture.
        model_name: Model to load; must be a key of ``EXPECTED_STRS_MAP``.

    Raises:
        AssertionError: If the number of generations differs from the number
            of golden strings, or if any generation differs from its golden
            string.
    """
    model = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="nvfp4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
        tokenizer.apply_chat_template(
            [{
                "role": "user",
                "content": prompt
            }],
            tokenize=False,
            add_generation_prompt=True,
        ) for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
    generations: list[str] = []
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:
        outputs = model.generate(prompt, params)
        generations.append(outputs[0].outputs[0].text)
    # Drop our reference to the engine before running the comparisons
    # (presumably so its resources can be reclaimed -- TODO confirm).
    del model

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    # Fail with a clear message on a length mismatch instead of raising an
    # IndexError or silently ignoring surplus golden strings.
    assert len(generations) == len(expected_strs), (
        f"Got {len(generations)} generations but {len(expected_strs)} "
        f"expected strings for {model_name!r}")
    for i, (expected_str,
            generated_str) in enumerate(zip(expected_strs, generations)):
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Type from typing import Optional
import os import os
import pytest import pytest
...@@ -21,12 +21,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -21,12 +21,12 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
def run_awq_test( def run_awq_test(
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, image_assets: _ImageAssets,
source_model: str, source_model: str,
quant_model: str, quant_model: str,
*, *,
size_factors: List[float], size_factors: list[float],
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
...@@ -110,7 +110,12 @@ def run_awq_test( ...@@ -110,7 +110,12 @@ def run_awq_test(
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode() @torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, source_model, quant_model, def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None: size_factors, dtype, max_tokens, num_logprobs,
monkeypatch) -> None:
# Test V1: this test hangs during setup on single-scale input.
# TODO: figure out why and re-enable this on V1.
monkeypatch.setenv("VLLM_USE_V1", "0")
run_awq_test( run_awq_test(
vllm_runner, vllm_runner,
image_assets, image_assets,
......
...@@ -6,12 +6,11 @@ import math ...@@ -6,12 +6,11 @@ import math
import os import os
from collections import defaultdict from collections import defaultdict
from pathlib import PosixPath from pathlib import PosixPath
from typing import Type
import os import os
import pytest import pytest
from packaging.version import Version from packaging.version import Version
from transformers import AutoModelForVision2Seq from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -19,7 +18,7 @@ from vllm.utils import identity ...@@ -19,7 +18,7 @@ from vllm.utils import identity
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets,
_VideoAssets) _VideoAssets)
from ....utils import (fork_new_process_for_each_test, large_gpu_mark, from ....utils import (create_new_process_for_each_test, large_gpu_mark,
multi_gpu_marks) multi_gpu_marks)
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners from .vlm_utils import custom_inputs, model_utils, runners
...@@ -35,6 +34,16 @@ from ....utils import models_path_prefix ...@@ -35,6 +34,16 @@ from ....utils import models_path_prefix
if current_platform.is_rocm(): if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
# Model-type keys for which the tests in this file set VLLM_USE_V1=0
# (via monkeypatch) before running, i.e. models that are exercised on the
# V0 engine only. The per-entry comments record why each one is excluded
# from V1.
REQUIRES_V0_MODELS = [
# V1 Test: no way to fall back for head_dim = 80
# https://github.com/vllm-project/vllm/issues/14524
"qwen_vl",
"h2ovl",
"blip2",
# V1 Test: not enough KV cache space in C1.
"fuyu",
]
# yapf: disable # yapf: disable
COMMON_BROADCAST_SETTINGS = { COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE, "test_type": VLMTestType.IMAGE,
...@@ -94,7 +103,7 @@ VLM_TEST_SETTINGS = { ...@@ -94,7 +103,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings, convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
...@@ -114,14 +123,10 @@ VLM_TEST_SETTINGS = { ...@@ -114,14 +123,10 @@ VLM_TEST_SETTINGS = {
"stop_sign": "caption es", "stop_sign": "caption es",
"cherry_blossom": "What is in the picture?", "cherry_blossom": "What is in the picture?",
}), }),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm() dtype="bfloat16",
else ("half", "float")), marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
marks=[pytest.mark.core_model],
), ),
# TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
# once we upgraded to transformers>=4.49.0. # once we upgraded to transformers>=4.49.0.
...@@ -160,30 +165,30 @@ VLM_TEST_SETTINGS = { ...@@ -160,30 +165,30 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( # "aria": VLMTestInfo(
models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")], # models=["rhymes-ai/Aria"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), # test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 # prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501
img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n", # img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
max_model_len=4096, # max_model_len=4096,
max_num_seqs=2, # max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts({ # auto_cls=AutoModelForImageTextToText,
"stop_sign": "<vlm_image>Please describe the image shortly.", # single_image_prompts=IMAGE_ASSETS.prompts({
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # "stop_sign": "<vlm_image>Please describe the image shortly.",
}), # "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501 # }),
postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
stop_str=["<|im_end|>"], # stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)], # image_size_factors=[(0.10, 0.15)],
max_tokens=64, # max_tokens=64,
marks=[large_gpu_mark(min_gb=64)], # marks=[large_gpu_mark(min_gb=64)],
), # ),
"blip2": VLMTestInfo( "blip2": VLMTestInfo(
models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")], models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output, vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
), ),
"chameleon": VLMTestInfo( "chameleon": VLMTestInfo(
...@@ -192,10 +197,7 @@ VLM_TEST_SETTINGS = { ...@@ -192,10 +197,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
# For chameleon, we only compare the sequences # For chameleon, we only compare the sequences
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2],
...@@ -215,7 +217,6 @@ VLM_TEST_SETTINGS = { ...@@ -215,7 +217,6 @@ VLM_TEST_SETTINGS = {
}), }),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
...@@ -233,21 +234,44 @@ VLM_TEST_SETTINGS = { ...@@ -233,21 +234,44 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
use_tokenizer_eos=True, use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10, num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
), ),
"glm4": VLMTestInfo( "gemma3": VLMTestInfo(
models=[os.path.join(models_path_prefix, "google/gemma-3-4b-it")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501
}),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
),
"glm4v": VLMTestInfo(
models=[os.path.join(models_path_prefix, "THUDM/glm-4v-9b")], models=[os.path.join(models_path_prefix, "THUDM/glm-4v-9b")],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=identity, prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
img_idx_to_prompt=lambda idx: "", single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
}),
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338], get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm_patch_hf_runner, patch_hf_runner=model_utils.glm4v_patch_hf_runner,
# The image embeddings match with HF but the outputs of the language
# decoder are only consistent up to 2 decimal places.
# So, we need to reduce the number of tokens for the test to pass.
max_tokens=8,
num_logprobs=10,
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
...@@ -263,7 +287,6 @@ VLM_TEST_SETTINGS = { ...@@ -263,7 +287,6 @@ VLM_TEST_SETTINGS = {
}), }),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=8192, max_model_len=8192,
dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
num_logprobs=10, num_logprobs=10,
patch_hf_runner=model_utils.h2ovl_patch_hf_runner, patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
...@@ -275,7 +298,7 @@ VLM_TEST_SETTINGS = { ...@@ -275,7 +298,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "<image>", img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.idefics3_trunc_hf_output, hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
), ),
"intern_vl": VLMTestInfo( "intern_vl": VLMTestInfo(
...@@ -292,10 +315,6 @@ VLM_TEST_SETTINGS = { ...@@ -292,10 +315,6 @@ VLM_TEST_SETTINGS = {
}), }),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096, max_model_len=4096,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
), ),
...@@ -304,7 +323,7 @@ VLM_TEST_SETTINGS = { ...@@ -304,7 +323,7 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
...@@ -319,9 +338,6 @@ VLM_TEST_SETTINGS = { ...@@ -319,9 +338,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
num_video_frames=16, num_video_frames=16,
max_model_len=16384, max_model_len=16384,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values_videos"
),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
...@@ -346,11 +362,8 @@ VLM_TEST_SETTINGS = { ...@@ -346,11 +362,8 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
max_model_len=4096, max_model_len=4096,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
get_stop_token_ids=lambda tok: [128009], get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
patch_hf_runner=model_utils.mantis_patch_hf_runner, patch_hf_runner=model_utils.mantis_patch_hf_runner,
marks=[ marks=[
...@@ -368,8 +381,8 @@ VLM_TEST_SETTINGS = { ...@@ -368,8 +381,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=["openbmb/MiniCPM-o-2_6"],
...@@ -379,11 +392,8 @@ VLM_TEST_SETTINGS = { ...@@ -379,11 +392,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_patch_hf_runner patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
), ),
"minicpmv_26": VLMTestInfo( "minicpmv_26": VLMTestInfo(
models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")], models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-V-2_6")],
...@@ -393,10 +403,8 @@ VLM_TEST_SETTINGS = { ...@@ -393,10 +403,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=["allenai/Molmo-7B-D-0924"],
...@@ -405,7 +413,6 @@ VLM_TEST_SETTINGS = { ...@@ -405,7 +413,6 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
patch_hf_runner=model_utils.molmo_patch_hf_runner, patch_hf_runner=model_utils.molmo_patch_hf_runner,
postprocess_inputs=model_utils.molmo_post_processor,
), ),
# Tests for phi3v currently live in another file because of a bug in # Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead. # transformers. Once this issue is fixed, we can enable them here instead.
...@@ -431,7 +438,7 @@ VLM_TEST_SETTINGS = { ...@@ -431,7 +438,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "[IMG]", img_idx_to_prompt=lambda idx: "[IMG]",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"qwen_vl": VLMTestInfo( "qwen_vl": VLMTestInfo(
...@@ -449,10 +456,7 @@ VLM_TEST_SETTINGS = { ...@@ -449,10 +456,7 @@ VLM_TEST_SETTINGS = {
models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")], models=[os.path.join(models_path_prefix, "facebook/chameleon-7b")],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal, comparator=check_outputs_equal,
...@@ -463,7 +467,7 @@ VLM_TEST_SETTINGS = { ...@@ -463,7 +467,7 @@ VLM_TEST_SETTINGS = {
models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS # type: ignore
...@@ -472,7 +476,7 @@ VLM_TEST_SETTINGS = { ...@@ -472,7 +476,7 @@ VLM_TEST_SETTINGS = {
models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS # type: ignore
...@@ -497,9 +501,6 @@ VLM_TEST_SETTINGS = { ...@@ -497,9 +501,6 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384, max_model_len=16384,
max_num_seqs=2, max_num_seqs=2,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
...@@ -509,6 +510,19 @@ VLM_TEST_SETTINGS = { ...@@ -509,6 +510,19 @@ VLM_TEST_SETTINGS = {
limit_mm_per_prompt={"image": 4}, limit_mm_per_prompt={"image": 4},
)], )],
), ),
# regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
limit_mm_per_prompt={"image": 1},
)],
),
} }
# yapf: enable # yapf: enable
...@@ -560,13 +574,15 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) ...@@ -560,13 +574,15 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_single_image_models(tmp_path: PosixPath, model_type: str, def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test( runners.run_single_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -583,13 +599,15 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ...@@ -583,13 +599,15 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE, test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_multi_image_models(tmp_path: PosixPath, model_type: str, def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test( runners.run_multi_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -606,13 +624,15 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ...@@ -606,13 +624,15 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING, test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_image_embedding_models(model_type: str, def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test( runners.run_embedding_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -628,11 +648,13 @@ def test_image_embedding_models(model_type: str, ...@@ -628,11 +648,13 @@ def test_image_embedding_models(model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets): video_assets: _VideoAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test( runners.run_video_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -648,14 +670,17 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -648,14 +670,17 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=False, create_new_process_for_each_test=False,
)) ))
def test_custom_inputs_models( def test_custom_inputs_models(
model_type: str, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch,
): ):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test( runners.run_custom_inputs_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -671,14 +696,16 @@ def test_custom_inputs_models( ...@@ -671,14 +696,16 @@ def test_custom_inputs_models(
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test( runners.run_single_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -695,14 +722,16 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -695,14 +722,16 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE, test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test( runners.run_multi_image_test(
tmp_path=tmp_path, tmp_path=tmp_path,
...@@ -719,14 +748,16 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -719,14 +748,16 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING, test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str, def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets): image_assets: _ImageAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test( runners.run_embedding_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -742,12 +773,14 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -742,12 +773,14 @@ def test_image_embedding_models_heavy(model_type: str,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets): video_assets: _VideoAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test( runners.run_video_test(
model_test_info=model_test_info, model_test_info=model_test_info,
...@@ -763,15 +796,18 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -763,15 +796,18 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
get_parametrized_options( get_parametrized_options(
VLM_TEST_SETTINGS, VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=True, create_new_process_for_each_test=True,
)) ))
@fork_new_process_for_each_test @create_new_process_for_each_test()
def test_custom_inputs_models_heavy( def test_custom_inputs_models_heavy(
model_type: str, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
monkeypatch,
): ):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test( runners.run_custom_inputs_test(
model_test_info=model_test_info, model_test_info=model_test_info,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import os import os
import re import re
from typing import List, Optional, Tuple, Type from typing import Optional
import pytest import pytest
from transformers import AutoTokenizer from transformers import AutoTokenizer
...@@ -26,7 +26,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these ...@@ -26,7 +26,7 @@ HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these
models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")] models = [os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]], Optional[SampleLogprobs]],
model: str): model: str):
"""Sanitize vllm output to be comparable with hf output.""" """Sanitize vllm output to be comparable with hf output."""
...@@ -56,9 +56,9 @@ if current_platform.is_rocm(): ...@@ -56,9 +56,9 @@ if current_platform.is_rocm():
def run_test( def run_test(
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput]], inputs: list[tuple[list[str], PromptImageInput]],
model: str, model: str,
*, *,
dtype: str, dtype: str,
......
# SPDX-License-Identifier: Apache-2.0
import os
import re
from typing import Optional
import pytest
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
# Single-image prompts, keyed by image asset name and wrapped by
# IMAGE_ASSETS.prompts. Each prompt is chat-formatted and carries exactly one
# `<|image_1|>` placeholder.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
# Prompt for the multi-image test: two image placeholders in one user turn.
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
# NOTE(review): snapshot_download runs when this module is imported (i.e. at
# test collection time), and its return value is used as a local directory
# below. Presumably this fetches the checkpoint from the Hugging Face Hub when
# it is not cached; other tests in this tree resolve models through
# `models_path_prefix` instead - confirm network access is acceptable here.
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
# Values for the `model` parametrization of the tests in this module.
models = [model_path]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Rewrite one vLLM result so it can be compared with the HF result.

    Args:
        vllm_output: ``(token_ids, text, logprobs)`` as produced by the vLLM
            runner. The token ids are discarded and recomputed from the text.
        model: Name or path handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        ``(hf_token_ids, hf_text, logprobs)`` where ``hf_text`` is the cleaned
        text followed by ``<|end|><|endoftext|>``, ``hf_token_ids`` is the
        tokenization of the cleaned text minus its first id, and ``logprobs``
        is passed through untouched.

    Raises:
        AssertionError: If the cleaned text does not begin with a space, or
            if the first encoded id is not ``1``.
        IndexError: If the cleaned text or the encoded id list is empty.
    """
    _discarded_ids, raw_text, logprobs = vllm_output

    # Drop every run of image placeholder tags from the generated text.
    image_tag_run = r"(<\|image_\d+\|>)+"
    text = re.sub(image_tag_run, "", raw_text)

    # The remaining text is required to start with a single space, which is
    # stripped before comparison.
    assert text[0] == " "
    text = text[1:]

    tokenizer = AutoTokenizer.from_pretrained(model)
    token_ids = tokenizer.encode(text)
    # Presumably id 1 is the BOS token the tokenizer prepends; it is removed
    # from the returned ids - TODO confirm against the tokenizer config.
    assert token_ids[0] == 1

    return token_ids[1:], text + "<|end|><|endoftext|>", logprobs
# dtype shared by the parametrized tests in this module.
target_dtype = "half"

# ROCm Triton FA can run into shared memory issues with these models,
# so it is switched off via the environment and other backends are used
# in the meantime.
# FIXME (mattwong, gshtrasb, hongxiayan)
if current_platform.is_rocm():
    os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: list[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that vLLM and HF produce close greedy logprobs for each case.

    Every ``(prompts, images)`` pair in ``inputs`` is generated greedily with
    the vLLM runner (after the vision LoRA has been added to its engine) and
    then with the HF runner. The two result lists are compared case by case
    with ``check_logprobs_close``.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        inputs: One ``(prompts, images)`` pair per test case.
        model: Model name or path given to both runners.
        max_model_len: Forwarded to the vLLM runner; should be greater than
            the image feature size.
        dtype: Forwarded to both runners.
        max_tokens: Maximum number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested from both runners.
        mm_limit: Per-prompt image limit, passed to vLLM as
            ``limit_mm_per_prompt={"image": mm_limit}``.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    vllm_options = {
        "task": "generate",
        "max_model_len": max_model_len,
        "max_num_seqs": 2,
        "dtype": dtype,
        "limit_mm_per_prompt": {"image": mm_limit},
        "tensor_parallel_size": tensor_parallel_size,
        "distributed_executor_backend": distributed_executor_backend,
        "enable_lora": True,
        "max_lora_rank": 320,
        "gpu_memory_utilization": 0.8,  # set to 0.8 to avoid OOM in CI
        "enforce_eager": True,
    }

    vllm_outputs_per_case = []
    with vllm_runner(model, **vllm_options) as vllm_model:
        # The vision LoRA is added to the engine before any generation.
        vllm_model.model.llm_engine.add_lora(
            lora_request=LoRARequest("vision", 1, vision_lora_path))
        for prompts, images in inputs:
            vllm_outputs_per_case.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                ))

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    hf_outputs_per_case = []
    with hf_runner(model,
                   dtype=dtype,
                   model_kwargs={"_attn_implementation": "eager"
                                 }) as hf_model:
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        for prompts, images in inputs:
            hf_outputs_per_case.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=images,
                    eos_token_id=eos_token_id,
                    num_logits_to_keep=0,
                ))

    # Compare the two runners case by case, HF first.
    for hf_case, vllm_case in zip(hf_outputs_per_case,
                                  vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
# Since we use _attn_implementation="eager" for hf_runner, there is more
# significant numerical difference. The basic `logprobs=5` fails to pass.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.7, 0.75, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [4096])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_model_len: int, max_tokens: int,
                num_logprobs: int) -> None:
    """Compare HF and vLLM on single-image prompts at several image scales.

    One test case is built per (image asset, prompt) pair: the prompt is
    repeated once per entry of ``size_factors`` and paired with the image
    rescaled by that factor. An empty ``size_factors`` list therefore yields
    empty prompt and image lists for every asset.
    """
    images = [asset.pil_image for asset in image_assets]

    inputs_per_image = []
    for image, prompt in zip(images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt] * len(size_factors)
        scaled_images = [
            rescale_image_size(image, factor) for factor in size_factors
        ]
        inputs_per_image.append((repeated_prompts, scaled_images))

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_image,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=1,
        tensor_parallel_size=1,
    )
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        # [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [10000])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.xfail(
    reason="Phi-4-MM multi-image inference is divergent with hf model.")
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
                             size_factors, dtype: str, max_model_len: int,
                             max_tokens: int, num_logprobs: int) -> None:
    """Compare HF and vLLM on prompts that reference several images at once.

    A single test case is built: the multi-image prompt is repeated once per
    entry of ``size_factors``, and each repetition is paired with the full
    list of asset images rescaled by that factor. The test is marked as an
    expected failure (see the ``xfail`` reason).
    """
    images = [asset.pil_image for asset in image_assets]

    repeated_prompts = [HF_MULTIIMAGE_IMAGE_PROMPT] * len(size_factors)
    # One group of rescaled images per size factor, in factor order.
    image_groups = []
    for factor in size_factors:
        image_groups.append(
            [rescale_image_size(image, factor) for image in images])
    inputs_per_case = [(repeated_prompts, image_groups)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs_per_case,
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        mm_limit=2,
        tensor_parallel_size=1,
    )
...@@ -4,9 +4,8 @@ ...@@ -4,9 +4,8 @@
Run `pytest tests/models/test_mistral.py`. Run `pytest tests/models/test_mistral.py`.
""" """
import json import json
import uuid
from dataclasses import asdict from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from typing import TYPE_CHECKING, Any, Optional
import os import os
import pytest import pytest
...@@ -17,8 +16,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer ...@@ -17,8 +16,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams, from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
TextPrompt, TokensPrompt)
from vllm.multimodal import MultiModalDataBuiltins from vllm.multimodal import MultiModalDataBuiltins
from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.inputs import PlaceholderRange
from vllm.sequence import Logprob, SampleLogprobs from vllm.sequence import Logprob, SampleLogprobs
...@@ -30,8 +28,11 @@ from ....utils import models_path_prefix ...@@ -30,8 +28,11 @@ from ....utils import models_path_prefix
if TYPE_CHECKING: if TYPE_CHECKING:
from _typeshed import StrPath from _typeshed import StrPath
MODELS = [os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409")] PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
#todo MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [os.path.join(models_path_prefix, PIXTRAL_ID), os.path.join(models_path_prefix, MISTRAL_SMALL_3_1_ID)]
IMG_URLS = [ IMG_URLS = [
"https://picsum.photos/id/237/400/300", "https://picsum.photos/id/237/400/300",
"https://picsum.photos/id/231/200/300", "https://picsum.photos/id/231/200/300",
...@@ -41,7 +42,7 @@ IMG_URLS = [ ...@@ -41,7 +42,7 @@ IMG_URLS = [
PROMPT = "Describe each image in one short sentence." PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
return [{ return [{
"role": "role":
"user", "user",
...@@ -57,7 +58,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: ...@@ -57,7 +58,7 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]:
}] }]
def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
return [{ return [{
"role": "role":
"user", "user",
...@@ -71,7 +72,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: ...@@ -71,7 +72,7 @@ def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]:
}] }]
def _create_engine_inputs(urls: List[str]) -> TokensPrompt: def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
msg = _create_msg_format(urls) msg = _create_msg_format(urls)
tokenizer = MistralTokenizer.from_model("pixtral") tokenizer = MistralTokenizer.from_model("pixtral")
...@@ -92,7 +93,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt: ...@@ -92,7 +93,7 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt:
return engine_inputs return engine_inputs
def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
msg = _create_msg_format_hf(urls) msg = _create_msg_format_hf(urls)
tokenizer = AutoProcessor.from_pretrained(os.path.join(models_path_prefix, "mistral-community/pixtral-12b")) tokenizer = AutoProcessor.from_pretrained(os.path.join(models_path_prefix, "mistral-community/pixtral-12b"))
...@@ -128,10 +129,12 @@ MAX_MODEL_LEN = [8192, 65536] ...@@ -128,10 +129,12 @@ MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures" FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists() assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json" FIXTURE_LOGPROBS_CHAT = {
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json" PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]] OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
# For the test author to store golden output in JSON # For the test author to store golden output in JSON
...@@ -169,12 +172,12 @@ def test_chat( ...@@ -169,12 +172,12 @@ def test_chat(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT) EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
tokenizer_mode="mistral", tokenizer_mode="mistral",
enable_chunked_prefill=False,
max_model_len=max_model_len, max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model: ) as vllm_model:
...@@ -186,70 +189,40 @@ def test_chat( ...@@ -186,70 +189,40 @@ def test_chat(
outputs.extend(output) outputs.extend(output)
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs) logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
# Remove last `None` prompt_logprobs to compare with fixture
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS, check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs, outputs_1_lst=logprobs,
name_0="h100_ref", name_0="h100_ref",
name_1="output") name_1="output")
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
args = EngineArgs(
model=model,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
dtype=dtype,
)
engine = LLMEngine.from_engine_args(args)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
outputs = []
count = 0
while True:
out = engine.step()
count += 1
for request_output in out:
if request_output.finished:
outputs.append(request_output)
if count == 2:
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
SAMPLING_PARAMS)
if not engine.has_unfinished_requests():
break
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@large_gpu_test(min_gb=48) @large_gpu_test(min_gb=48)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"prompt,expected_ranges", "prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{ [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
"offset": 10, "offset": 11,
"length": 494 "length": 494
}]), }]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{ (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
"offset": 10, "offset": 11,
"length": 266 "length": 266
}, { }, {
"offset": 276, "offset": 277,
"length": 1056 "length": 1056
}, { }, {
"offset": 1332, "offset": 1333,
"length": 418 "length": 418
}])]) }])])
def test_multi_modal_placeholders( def test_multi_modal_placeholders(vllm_runner, prompt,
vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None: expected_ranges: list[PlaceholderRange],
monkeypatch) -> None:
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner( with vllm_runner(
os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), os.path.join(models_path_prefix, "mistral-community/pixtral-12b"),
max_model_len=8192, max_model_len=8192,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Any, List, Optional, Tuple, Type, TypedDict, Union from typing import Any, Optional, TypedDict, Union
import os import os
import numpy.typing as npt import numpy.typing as npt
...@@ -16,6 +16,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, ...@@ -16,6 +16,15 @@ from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix from ....utils import models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1=0`` for every test in this module.

    V1 Test: the ``batch_make_*_embeddings`` helpers call a V0 internal, so
    the environment variable is set (through ``monkeypatch``) for each test
    function automatically.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
models = [os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")] models = [os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")]
target_dtype = "half" target_dtype = "half"
...@@ -71,21 +80,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): ...@@ -71,21 +80,21 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def batch_make_image_embeddings( def batch_make_image_embeddings(
image_batches: List[Union[Image.Image, List[Image.Image]]], processor, image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL """batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch, This will infer all images' embeddings in a single batch,
and split the result according to input batches. and split the result according to input batches.
image_batches: image_batches:
- Single-image batches: `List[Image.Image]` - Single-image batches: `list[Image.Image]`
- Multiple-image batches: `List[List[Image.Image]]]` - Multiple-image batches: `list[list[Image.Image]]]`
returns: `List[Qwen2VLPromptImageEmbeddingInput]` returns: `list[Qwen2VLPromptImageEmbeddingInput]`
""" """
image_batches_: List[Any] = image_batches[:] image_batches_: list[Any] = image_batches[:]
# convert single-image batches to multiple-image batches # convert single-image batches to multiple-image batches
for idx in range(len(image_batches_)): for idx in range(len(image_batches_)):
...@@ -95,7 +104,7 @@ def batch_make_image_embeddings( ...@@ -95,7 +104,7 @@ def batch_make_image_embeddings(
assert isinstance(image_batches_[idx], list) assert isinstance(image_batches_[idx], list)
# append all images into a list (as a batch) # append all images into a list (as a batch)
images: List[Image.Image] = [] images: list[Image.Image] = []
for image_batch in image_batches_: for image_batch in image_batches_:
images += image_batch images += image_batch
...@@ -120,10 +129,11 @@ def batch_make_image_embeddings( ...@@ -120,10 +129,11 @@ def batch_make_image_embeddings(
return visual(pixel_values_on_device, return visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device) grid_thw=image_grid_thw_on_device)
# V1 Test: this calls a V0 internal.
image_embeds = torch.concat(llm.apply_model(get_image_embeds)) image_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches # split into original batches
result: List[Qwen2VLPromptImageEmbeddingInput] = [] result: list[Qwen2VLPromptImageEmbeddingInput] = []
image_counter = 0 image_counter = 0
embed_counter = 0 embed_counter = 0
for image_batch in image_batches_: for image_batch in image_batches_:
...@@ -155,7 +165,7 @@ def batch_make_image_embeddings( ...@@ -155,7 +165,7 @@ def batch_make_image_embeddings(
def batch_make_video_embeddings( def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor, video_batches: PromptVideoInput, processor,
llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL """batched video embeddings for Qwen2-VL
A NDArray represents a single video's all frames. A NDArray represents a single video's all frames.
...@@ -164,21 +174,21 @@ def batch_make_video_embeddings( ...@@ -164,21 +174,21 @@ def batch_make_video_embeddings(
and split the result according to input batches. and split the result according to input batches.
video_batches: video_batches:
- Single-video batches: `List[NDArray]` - Single-video batches: `list[NDArray]`
- Multiple-video batches: `List[List[NDArray]]` - Multiple-video batches: `list[list[NDArray]]`
""" """
video_batches_: List[Any] = video_batches[:] video_batches_: list[Any] = video_batches[:]
for idx in range(len(video_batches_)): for idx in range(len(video_batches_)):
if not isinstance(video_batches_[idx], list): if not isinstance(video_batches_[idx], list):
single_video_batch: List[npt.NDArray] = [video_batches_[idx]] single_video_batch: list[npt.NDArray] = [video_batches_[idx]]
video_batches_[idx] = single_video_batch video_batches_[idx] = single_video_batch
assert isinstance(video_batches_[idx], list) assert isinstance(video_batches_[idx], list)
# append all videos into a list (as a batch) # append all videos into a list (as a batch)
videos: List[npt.NDArray] = [] videos: list[npt.NDArray] = []
for video_batch in video_batches_: for video_batch in video_batches_:
videos += video_batch videos += video_batch
...@@ -203,10 +213,11 @@ def batch_make_video_embeddings( ...@@ -203,10 +213,11 @@ def batch_make_video_embeddings(
return visual(pixel_values_on_device, return visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device) grid_thw=video_grid_thw_on_device)
# V1 Test: this calls a V0 internal.
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) video_embeds = torch.concat(llm.apply_model(get_image_embeds))
# split into original batches # split into original batches
result: List[Qwen2VLPromptVideoEmbeddingInput] = [] result: list[Qwen2VLPromptVideoEmbeddingInput] = []
video_counter = 0 video_counter = 0
embed_counter = 0 embed_counter = 0
for video_batch in video_batches_: for video_batch in video_batches_:
...@@ -237,8 +248,8 @@ def batch_make_video_embeddings( ...@@ -237,8 +248,8 @@ def batch_make_video_embeddings(
def run_embedding_input_test( def run_embedding_input_test(
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], inputs: list[tuple[list[str], PromptImageInput, PromptVideoInput]],
model: str, model: str,
*, *,
dtype: str, dtype: str,
...@@ -255,7 +266,6 @@ def run_embedding_input_test( ...@@ -255,7 +266,6 @@ def run_embedding_input_test(
processor = AutoProcessor.from_pretrained(model) processor = AutoProcessor.from_pretrained(model)
# NOTE:
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
task="generate", task="generate",
...@@ -325,8 +335,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, ...@@ -325,8 +335,8 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
num_logprobs: int) -> None: num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
inputs_per_case: List[Tuple[ inputs_per_case: list[tuple[
List[str], PromptImageInput, PromptVideoInput]] = [( list[str], PromptImageInput, PromptVideoInput]] = [(
[prompt for _ in size_factors], [prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors], [rescale_image_size(image, factor) for factor in size_factors],
[], [],
...@@ -367,7 +377,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, ...@@ -367,7 +377,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
num_logprobs: int) -> None: num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
inputs_per_case: List[Tuple[List[str], PromptImageInput, inputs_per_case: list[tuple[list[str], PromptImageInput,
PromptVideoInput]] = [( PromptVideoInput]] = [(
[MULTIIMAGE_PROMPT for _ in size_factors], [MULTIIMAGE_PROMPT for _ in size_factors],
[[ [[
...@@ -415,8 +425,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, ...@@ -415,8 +425,8 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
for asset in video_assets for asset in video_assets
] ]
inputs_per_case: List[Tuple[ inputs_per_case: list[tuple[
List[str], PromptImageInput, PromptVideoInput]] = [( list[str], PromptImageInput, PromptVideoInput]] = [(
[prompt for _ in size_factors], [prompt for _ in size_factors],
[], [],
[rescale_video_size(video, factor) for factor in size_factors], [rescale_video_size(video, factor) for factor in size_factors],
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Helpers for building inputs that can be leveraged for different test types. """Helpers for building inputs that can be leveraged for different test types.
""" """
from collections.abc import Iterable
from pathlib import PosixPath from pathlib import PosixPath
from typing import Callable, Iterable, List, Optional, Tuple, Union from typing import Callable, Optional, Union
import torch import torch
...@@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], ...@@ -33,7 +34,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
def get_model_prompts(base_prompts: Iterable[str], def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]], img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]], video_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> List[str]: prompt_formatter: Callable[[str], str]) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s) """Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model. to get the test prompt string for this model.
...@@ -218,7 +219,7 @@ def build_video_inputs_from_test_info( ...@@ -218,7 +219,7 @@ def build_video_inputs_from_test_info(
) for video, prompt in zip(sampled_vids, model_prompts)] ) for video, prompt in zip(sampled_vids, model_prompts)]
def apply_image_size_scaling(image, size: Union[float, Tuple[int, int]], def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
size_type: SizeType): size_type: SizeType):
"""Applies a size scaler to one image; this can be a an image size factor, """Applies a size scaler to one image; this can be a an image size factor,
which scales the image while maintaining the aspect ratio""" which scales the image while maintaining the aspect ratio"""
......
...@@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on. ...@@ -5,7 +5,7 @@ handling multimodal placeholder substitution, and so on.
""" """
import itertools import itertools
from collections import OrderedDict from collections import OrderedDict
from typing import Dict, Iterable, Tuple from collections.abc import Iterable
import pytest import pytest
...@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs, ...@@ -13,9 +13,9 @@ from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType) ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo], def get_filtered_test_settings(
test_type: VLMTestType, test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
fork_per_test: bool) -> Dict[str, VLMTestInfo]: new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict """Given the dict of potential test settings to run, return a subdict
of tests who have the current test type enabled with the matching val for of tests who have the current test type enabled with the matching val for
fork_per_test. fork_per_test.
...@@ -43,22 +43,22 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo], ...@@ -43,22 +43,22 @@ def get_filtered_test_settings(test_settings: Dict[str, VLMTestInfo],
# Everything looks okay; keep if this is has correct proc handling # Everything looks okay; keep if this is has correct proc handling
if (test_info.distributed_executor_backend if (test_info.distributed_executor_backend
is not None) == fork_per_test: is not None) == new_proc_per_test:
matching_tests[test_name] = test_info matching_tests[test_name] = test_info
return matching_tests return matching_tests
def get_parametrized_options(test_settings: Dict[str, VLMTestInfo], def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType, test_type: VLMTestType,
fork_new_process_for_each_test: bool): create_new_process_for_each_test: bool):
"""Converts all of our VLMTestInfo into an expanded list of parameters. """Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases. size factors etc, while still running in isolated test cases.
""" """
matching_tests = get_filtered_test_settings( matching_tests = get_filtered_test_settings(
test_settings, test_type, fork_new_process_for_each_test) test_settings, test_type, create_new_process_for_each_test)
# Ensure that something is wrapped as an iterable it's not already # Ensure that something is wrapped as an iterable it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, ) ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
...@@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo], ...@@ -121,7 +121,7 @@ def get_parametrized_options(test_settings: Dict[str, VLMTestInfo],
def get_wrapped_test_sizes( def get_wrapped_test_sizes(
test_info: VLMTestInfo, test_info: VLMTestInfo,
test_type: VLMTestType) -> Tuple[ImageSizeWrapper, ...]: test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them """Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter and combine them into an iterable, each of which will be used in parameter
expansion. expansion.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Core test implementation to be shared across modalities.""" """Core test implementation to be shared across modalities."""
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union from typing import Any, Callable, Optional, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
...@@ -17,9 +16,9 @@ from .types import RunnerOutput ...@@ -17,9 +16,9 @@ from .types import RunnerOutput
def run_test( def run_test(
*, *,
hf_runner: Type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: Type[VllmRunner], vllm_runner: type[VllmRunner],
inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]], inputs: list[tuple[list[str], list[Union[list[Image], Image]]]],
model: str, model: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
...@@ -29,15 +28,14 @@ def run_test( ...@@ -29,15 +28,14 @@ def run_test(
max_num_seqs: int, max_num_seqs: int,
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
auto_cls: Type[_BaseAutoModelClass], auto_cls: type[_BaseAutoModelClass],
use_tokenizer_eos: bool, use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None], comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
stop_str: Optional[List[str]], stop_str: Optional[list[str]],
limit_mm_per_prompt: Dict[str, int], limit_mm_per_prompt: dict[str, int],
vllm_runner_kwargs: Optional[Dict[str, Any]], vllm_runner_kwargs: Optional[dict[str, Any]],
hf_model_kwargs: Optional[Dict[str, Any]], hf_model_kwargs: Optional[dict[str, Any]],
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
task: TaskOption = "auto", task: TaskOption = "auto",
runner_mm_key: str = "images", runner_mm_key: str = "images",
...@@ -61,7 +59,9 @@ def run_test( ...@@ -61,7 +59,9 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
vllm_runner_kwargs_: Dict[str, Any] = {} vllm_runner_kwargs_: dict[str, Any] = {
"disable_mm_preprocessor_cache": True,
}
if model_info.tokenizer: if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode: if model_info.tokenizer_mode:
...@@ -84,7 +84,7 @@ def run_test( ...@@ -84,7 +84,7 @@ def run_test(
**vllm_runner_kwargs_) as vllm_model: **vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.model.get_tokenizer()
vllm_kwargs: Dict[str, Any] = {} vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None: if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str: if stop_str:
...@@ -99,7 +99,6 @@ def run_test( ...@@ -99,7 +99,6 @@ def run_test(
hf_model = hf_runner(model, hf_model = hf_runner(model,
dtype=dtype, dtype=dtype,
auto_cls=auto_cls, auto_cls=auto_cls,
postprocess_inputs=postprocess_inputs,
model_kwargs=hf_model_kwargs) model_kwargs=hf_model_kwargs)
# Some models need to patch things like the model processor, e.g., internvl # Some models need to patch things like the model processor, e.g., internvl
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Custom input builders for edge-cases in different models.""" """Custom input builders for edge-cases in different models."""
from io import BytesIO
from typing import Callable from typing import Callable
import requests
from PIL import Image
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video, from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video) sample_frames_from_video)
...@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl(): ...@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
build_single_image_inputs(images, formatted_sprompts, wrapped_sf), build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf), build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
] ]
def windows_attention_image_qwen2_5_vl():
    """Build the single-image input case for the Qwen2.5-VL regression image.

    Downloads the image referenced by the regression issue linked below,
    pairs it with one chat-formatted "Describe the image." prompt, and
    passes both to ``build_single_image_inputs`` with a single size factor
    of 0.5.

    Returns:
        Whatever ``build_single_image_inputs`` returns for the one image,
        one prompt, and the 0.5 size-factor wrapper.

    Raises:
        requests.HTTPError: If the image server answers with an error status.
        requests.Timeout: If the download does not complete in time.
    """
    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"

    # Bound the download so a stalled connection cannot hang the test run,
    # and fail on HTTP errors here rather than letting PIL try to decode an
    # error page as an image.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))

    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment