Commit ca796e19 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.1' into v0.8.1-ori

parents e983c804 61c7a1b8
...@@ -34,7 +34,7 @@ def phi3v_model_config(): ...@@ -34,7 +34,7 @@ def phi3v_model_config():
tokenizer=PHI3V_MODEL_ID, tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="auto",
seed=0, seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -58,7 +58,7 @@ def mllama_model_config(): ...@@ -58,7 +58,7 @@ def mllama_model_config():
tokenizer=MLLAMA_MODEL_ID, tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="auto",
seed=0, seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): ...@@ -669,7 +669,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
tokenizer=MLLAMA_MODEL_ID, tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="auto",
seed=0, seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
......
...@@ -5,11 +5,10 @@ from typing import Optional ...@@ -5,11 +5,10 @@ from typing import Optional
import numpy as np import numpy as np
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from transformers import AutoModel, AutoTokenizer, BatchEncoding from transformers import AutoModel, AutoTokenizer
from vllm.multimodal.audio import resample_audio from vllm.multimodal.audio import resample_audio
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
...@@ -107,8 +106,6 @@ def run_test( ...@@ -107,8 +106,6 @@ def run_test(
**kwargs, **kwargs,
): ):
"""Inference result should be the same between hf and vllm.""" """Inference result should be the same between hf and vllm."""
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
# NOTE: take care of the order. run vLLM first, and then run HF. # NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
...@@ -124,15 +121,7 @@ def run_test( ...@@ -124,15 +121,7 @@ def run_test(
for vllm_prompt, _, audio in prompts_and_audios for vllm_prompt, _, audio in prompts_and_audios
] ]
def process(hf_inputs: BatchEncoding, **kwargs): with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
hf_inputs["audio_values"] = hf_inputs["audio_values"] \
.to(torch_dtype) # type: ignore
return hf_inputs
with hf_runner(model,
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
hf_outputs_per_audio = [ hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit( hf_model.generate_greedy_logprobs_limit(
[hf_prompt], [hf_prompt],
......
...@@ -9,7 +9,7 @@ from pathlib import PosixPath ...@@ -9,7 +9,7 @@ from pathlib import PosixPath
import pytest import pytest
from packaging.version import Version from packaging.version import Version
from transformers import AutoModelForPreTraining, AutoModelForVision2Seq from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = { ...@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings, convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
...@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = { ...@@ -121,10 +121,7 @@ VLM_TEST_SETTINGS = {
"stop_sign": "caption es", "stop_sign": "caption es",
"cherry_blossom": "What is in the picture?", "cherry_blossom": "What is in the picture?",
}), }),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16", dtype="bfloat16",
marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501 marks=[pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")], # noqa: E501
...@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = { ...@@ -179,7 +176,6 @@ VLM_TEST_SETTINGS = {
# "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501 # "cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501
# }), # }),
# multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501 # multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501
# postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), # noqa: E501
# stop_str=["<|im_end|>"], # stop_str=["<|im_end|>"],
# image_size_factors=[(0.10, 0.15)], # image_size_factors=[(0.10, 0.15)],
# max_tokens=64, # max_tokens=64,
...@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = { ...@@ -190,7 +186,7 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output, vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
), ),
"chameleon": VLMTestInfo( "chameleon": VLMTestInfo(
...@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = { ...@@ -199,10 +195,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
# For chameleon, we only compare the sequences # For chameleon, we only compare the sequences
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2],
...@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = { ...@@ -222,7 +215,6 @@ VLM_TEST_SETTINGS = {
}), }),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
...@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = { ...@@ -240,6 +232,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
use_tokenizer_eos=True, use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10, num_logprobs=10,
...@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = { ...@@ -256,9 +249,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
# TODO: Use AutoModelForVision2Seq once transformers supports this auto_cls=AutoModelForImageTextToText,
auto_cls=AutoModelForPreTraining,
dtype="bfloat16",
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
), ),
...@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = { ...@@ -272,7 +263,6 @@ VLM_TEST_SETTINGS = {
}), }),
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338], get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm4v_patch_hf_runner, patch_hf_runner=model_utils.glm4v_patch_hf_runner,
# The image embeddings match with HF but the outputs of the language # The image embeddings match with HF but the outputs of the language
...@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = { ...@@ -295,7 +285,6 @@ VLM_TEST_SETTINGS = {
}), }),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=8192, max_model_len=8192,
dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
num_logprobs=10, num_logprobs=10,
patch_hf_runner=model_utils.h2ovl_patch_hf_runner, patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
...@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = { ...@@ -307,7 +296,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "<image>", img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.idefics3_trunc_hf_output, hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
), ),
"intern_vl": VLMTestInfo( "intern_vl": VLMTestInfo(
...@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = { ...@@ -324,10 +313,6 @@ VLM_TEST_SETTINGS = {
}), }),
multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.", # noqa: E501
max_model_len=4096, max_model_len=4096,
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
), ),
...@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = { ...@@ -336,7 +321,7 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
...@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = { ...@@ -351,9 +336,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
num_video_frames=16, num_video_frames=16,
max_model_len=16384, max_model_len=16384,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values_videos"
),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
...@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = { ...@@ -378,11 +360,8 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
max_model_len=4096, max_model_len=4096,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
get_stop_token_ids=lambda tok: [128009], get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
patch_hf_runner=model_utils.mantis_patch_hf_runner, patch_hf_runner=model_utils.mantis_patch_hf_runner,
marks=[ marks=[
...@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = { ...@@ -400,8 +379,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=["openbmb/MiniCPM-o-2_6"],
...@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = { ...@@ -411,11 +390,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_patch_hf_runner patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
), ),
"minicpmv_26": VLMTestInfo( "minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"], models=["openbmb/MiniCPM-V-2_6"],
...@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = { ...@@ -425,10 +401,8 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
postprocess_inputs=model_utils.ignore_inputs_post_processor(
"image_sizes"
),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=["allenai/Molmo-7B-D-0924"],
...@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = { ...@@ -437,7 +411,6 @@ VLM_TEST_SETTINGS = {
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
patch_hf_runner=model_utils.molmo_patch_hf_runner, patch_hf_runner=model_utils.molmo_patch_hf_runner,
postprocess_inputs=model_utils.molmo_post_processor,
), ),
# Tests for phi3v currently live in another file because of a bug in # Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead. # transformers. Once this issue is fixed, we can enable them here instead.
...@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = { ...@@ -463,7 +436,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "[IMG]", img_idx_to_prompt=lambda idx: "[IMG]",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"qwen_vl": VLMTestInfo( "qwen_vl": VLMTestInfo(
...@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = { ...@@ -481,10 +454,7 @@ VLM_TEST_SETTINGS = {
models=["facebook/chameleon-7b"], models=["facebook/chameleon-7b"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2],
hf_output_post_proc = lambda hf_output, model: hf_output[:2], hf_output_post_proc = lambda hf_output, model: hf_output[:2],
comparator=check_outputs_equal, comparator=check_outputs_equal,
...@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = { ...@@ -495,7 +465,7 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-1.5-7b-hf"], models=["llava-hf/llava-1.5-7b-hf"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS # type: ignore
...@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = { ...@@ -504,7 +474,7 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-v1.6-mistral-7b-hf"], models=["llava-hf/llava-v1.6-mistral-7b-hf"],
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS # type: ignore
...@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = { ...@@ -529,9 +499,6 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=16384, max_model_len=16384,
max_num_seqs=2, max_num_seqs=2,
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
Run `pytest tests/models/test_mistral.py`. Run `pytest tests/models/test_mistral.py`.
""" """
import json import json
import uuid
from dataclasses import asdict from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional
...@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer ...@@ -16,8 +15,7 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams, from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
TextPrompt, TokensPrompt)
from vllm.multimodal import MultiModalDataBuiltins from vllm.multimodal import MultiModalDataBuiltins
from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.inputs import PlaceholderRange
from vllm.sequence import Logprob, SampleLogprobs from vllm.sequence import Logprob, SampleLogprobs
...@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close ...@@ -28,7 +26,11 @@ from ...utils import check_logprobs_close
if TYPE_CHECKING: if TYPE_CHECKING:
from _typeshed import StrPath from _typeshed import StrPath
MODELS = ["mistralai/Pixtral-12B-2409"] PIXTRAL_ID = "mistralai/Pixtral-12B-2409"
MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
IMG_URLS = [ IMG_URLS = [
"https://picsum.photos/id/237/400/300", "https://picsum.photos/id/237/400/300",
"https://picsum.photos/id/231/200/300", "https://picsum.photos/id/231/200/300",
...@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536] ...@@ -125,8 +127,10 @@ MAX_MODEL_LEN = [8192, 65536]
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures" FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
assert FIXTURES_PATH.exists() assert FIXTURES_PATH.exists()
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json" FIXTURE_LOGPROBS_CHAT = {
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json" PIXTRAL_ID: FIXTURES_PATH / "pixtral_chat.json",
MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
}
OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]] OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
...@@ -166,12 +170,12 @@ def test_chat( ...@@ -166,12 +170,12 @@ def test_chat(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT) EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner( with vllm_runner(
model, model,
dtype=dtype, dtype=dtype,
tokenizer_mode="mistral", tokenizer_mode="mistral",
enable_chunked_prefill=False,
max_model_len=max_model_len, max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model: ) as vllm_model:
...@@ -183,70 +187,40 @@ def test_chat( ...@@ -183,70 +187,40 @@ def test_chat(
outputs.extend(output) outputs.extend(output)
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs) logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
# Remove last `None` prompt_logprobs to compare with fixture
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS, check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs, outputs_1_lst=logprobs,
name_0="h100_ref", name_0="h100_ref",
name_1="output") name_1="output")
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_model_engine(vllm_runner, model: str, dtype: str) -> None:
EXPECTED_ENGINE_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_ENGINE)
args = EngineArgs(
model=model,
tokenizer_mode="mistral",
enable_chunked_prefill=False,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
dtype=dtype,
)
engine = LLMEngine.from_engine_args(args)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[0], SAMPLING_PARAMS)
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[1], SAMPLING_PARAMS)
outputs = []
count = 0
while True:
out = engine.step()
count += 1
for request_output in out:
if request_output.finished:
outputs.append(request_output)
if count == 2:
engine.add_request(uuid.uuid4().hex, ENGINE_INPUTS[2],
SAMPLING_PARAMS)
if not engine.has_unfinished_requests():
break
logprobs = vllm_runner._final_steps_generate_w_logprobs(outputs)
check_logprobs_close(outputs_0_lst=EXPECTED_ENGINE_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@large_gpu_test(min_gb=48) @large_gpu_test(min_gb=48)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"prompt,expected_ranges", "prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{ [(_create_engine_inputs_hf(IMG_URLS[:1]), [{
"offset": 10, "offset": 11,
"length": 494 "length": 494
}]), }]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{ (_create_engine_inputs_hf(IMG_URLS[1:4]), [{
"offset": 10, "offset": 11,
"length": 266 "length": 266
}, { }, {
"offset": 276, "offset": 277,
"length": 1056 "length": 1056
}, { }, {
"offset": 1332, "offset": 1333,
"length": 418 "length": 418
}])]) }])])
def test_multi_modal_placeholders( def test_multi_modal_placeholders(vllm_runner, prompt,
vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None: expected_ranges: list[PlaceholderRange],
monkeypatch) -> None:
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner( with vllm_runner(
"mistral-community/pixtral-12b", "mistral-community/pixtral-12b",
max_model_len=8192, max_model_len=8192,
......
...@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union ...@@ -4,7 +4,6 @@ from typing import Any, Callable, Optional, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
...@@ -31,7 +30,6 @@ def run_test( ...@@ -31,7 +30,6 @@ def run_test(
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
auto_cls: type[_BaseAutoModelClass], auto_cls: type[_BaseAutoModelClass],
use_tokenizer_eos: bool, use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None], comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
stop_str: Optional[list[str]], stop_str: Optional[list[str]],
...@@ -101,7 +99,6 @@ def run_test( ...@@ -101,7 +99,6 @@ def run_test(
hf_model = hf_runner(model, hf_model = hf_runner(model,
dtype=dtype, dtype=dtype,
auto_cls=auto_cls, auto_cls=auto_cls,
postprocess_inputs=postprocess_inputs,
model_kwargs=hf_model_kwargs) model_kwargs=hf_model_kwargs)
# Some models need to patch things like the model processor, e.g., internvl # Some models need to patch things like the model processor, e.g., internvl
......
...@@ -6,16 +6,15 @@ typically specific to a small subset of models. ...@@ -6,16 +6,15 @@ typically specific to a small subset of models.
import re import re
import types import types
from pathlib import PosixPath from pathlib import PosixPath
from typing import Callable, Optional, Union from typing import Optional, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchEncoding, from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig) GenerationConfig)
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import HfRunner, ImageAsset, _ImageAssets from .....conftest import HfRunner, ImageAsset, _ImageAssets
from .types import RunnerOutput from .types import RunnerOutput
...@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets): ...@@ -211,40 +210,6 @@ def get_llava_embeddings(image_assets: _ImageAssets):
return [asset.image_embeds for asset in image_assets] return [asset.image_embeds for asset in image_assets]
####### postprocessors to run on HF BatchEncoding
def cast_dtype_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which converts a given key into a
target data type."""
def process(hf_inputs: BatchEncoding, dtype: str):
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
hf_inputs[hf_inp_key] = hf_inputs[hf_inp_key].to(torch_dtype)
return hf_inputs
return process
def ignore_inputs_post_processor(
hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]:
"""Gets a handle to a post processor which ignores a given key."""
def process(hf_inputs: BatchEncoding, dtype: str):
del hf_inputs[hf_inp_key]
return hf_inputs
return process
def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str):
return {"model_inputs": hf_inputs}
def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str):
hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype)
return {k: v.unsqueeze(0) for k, v in hf_inputs.items()}
####### Prompt path encoders for models that need models on disk ####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder( def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
...@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -295,8 +260,7 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
for k in inputs.keys() # noqa for k in inputs.keys() # noqa
if k not in ("seq_lens", "sft_format") if k not in ("seq_lens", "sft_format")
} }
inputs = BatchEncoding(data=inputs, tensor_type="pt") return BatchFeature(data=inputs, tensor_type="pt")
return inputs
hf_model.processor = processor hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \ hf_model.model.get_output_embeddings = lambda: \
...@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -529,10 +493,52 @@ def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model return hf_model
def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate orig_generate = hf_model.model.generate
def _generate(self, *args, **kwargs): def _generate(
self,
*args,
input_ids=None,
pixel_values=None,
image_sizes=None,
image_bound=None,
tgt_sizes=None,
**kwargs,
):
model_inputs = {
"input_ids": input_ids,
"pixel_values": pixel_values,
"image_sizes": image_sizes,
"image_bound": image_bound,
"tgt_sizes": tgt_sizes,
}
for k in list(model_inputs.keys()):
if model_inputs[k] is None:
model_inputs.pop(k)
return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, image_sizes=None, **kwargs):
return orig_generate(*args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
orig_generate = hf_model.model.generate
def _generate(self, *args, image_sizes=None, **kwargs):
return orig_generate(*args, decode_text=False, **kwargs) return orig_generate(*args, decode_text=False, **kwargs)
hf_model.model.generate = types.MethodType(_generate, hf_model.model) hf_model.model.generate = types.MethodType(_generate, hf_model.model)
...@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -551,10 +557,11 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs): def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
batch = { batch = {
k: kwargs.pop(k) k: kwargs.pop(k).unsqueeze(0)
for k in ("input_ids", "images", "image_input_idx", "image_masks") for k in ("input_ids", "images", "image_input_idx", "image_masks")
if k in kwargs if k in kwargs
} }
batch = BatchFeature(batch).to(dtype=self.dtype)
return self.generate_from_batch( return self.generate_from_batch(
batch, batch,
......
...@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union ...@@ -8,13 +8,12 @@ from typing import Any, Callable, NamedTuple, Optional, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from pytest import MarkDecorator from pytest import MarkDecorator
from transformers import AutoModelForCausalLM, BatchEncoding from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
from ....utils import check_logprobs_close from ....utils import check_logprobs_close
...@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple): ...@@ -110,11 +109,6 @@ class VLMTestInfo(NamedTuple):
# Indicates we should explicitly pass the EOS from the tokenizer # Indicates we should explicitly pass the EOS from the tokenizer
use_tokenizer_eos: bool = False use_tokenizer_eos: bool = False
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
# Callable to pass to the HF runner to run on inputs; for now, we also pass
# the data type to input post processing, because almost all of the uses of
# postprocess_inputs are to fix the data types of BatchEncoding values.
postprocess_inputs: Callable[[BatchEncoding, str],
BatchEncoding] = identity
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
# Post processors that if defined, will run on the outputs of the # Post processors that if defined, will run on the outputs of the
...@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple): ...@@ -130,7 +124,7 @@ class VLMTestInfo(NamedTuple):
# is all combinations of .models + all fields below # is all combinations of .models + all fields below
max_tokens: Union[int, tuple[int]] = 128 max_tokens: Union[int, tuple[int]] = 128
num_logprobs: Union[int, tuple[int]] = 5 num_logprobs: Union[int, tuple[int]] = 5
dtype: Union[str, Iterable[str]] = "half" dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
# Only expanded in video tests # Only expanded in video tests
num_video_frames: Union[int, tuple[int]] = 16 num_video_frames: Union[int, tuple[int]] = 16
...@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple): ...@@ -171,7 +165,6 @@ class VLMTestInfo(NamedTuple):
"vllm_output_post_proc": self.vllm_output_post_proc, "vllm_output_post_proc": self.vllm_output_post_proc,
"auto_cls": self.auto_cls, "auto_cls": self.auto_cls,
"use_tokenizer_eos": self.use_tokenizer_eos, "use_tokenizer_eos": self.use_tokenizer_eos,
"postprocess_inputs": self.postprocess_inputs,
"comparator": self.comparator, "comparator": self.comparator,
"get_stop_token_ids": self.get_stop_token_ids, "get_stop_token_ids": self.get_stop_token_ids,
"hf_model_kwargs": self.hf_model_kwargs, "hf_model_kwargs": self.hf_model_kwargs,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from functools import partial
from typing import Callable from typing import Callable
import pytest import pytest
import torch import torch
import torch.nn.functional as F
from PIL import Image from PIL import Image
from transformers import BatchEncoding, Qwen2VLForConditionalGeneration from transformers import Qwen2VLForConditionalGeneration
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
...@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos( ...@@ -75,10 +75,6 @@ def apply_chat_template_and_add_eos(
return prompt return prompt
def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
def _run_test( def _run_test(
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
...@@ -118,14 +114,8 @@ def _run_test( ...@@ -118,14 +114,8 @@ def _run_test(
with hf_runner(model, with hf_runner(model,
dtype=dtype, dtype=dtype,
auto_cls=Qwen2VLForConditionalGeneration) as hf_model: auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
hf_model.postprocess_inputs = partial(
postprocess_inputs, prompts = []
hf_model,
cache_position=torch.arange(
0,
1, # 1 for batch size
requires_grad=False),
use_cache=False)
for text, image, embed_text in zip(input_texts, input_images, for text, image, embed_text in zip(input_texts, input_images,
embed_texts): embed_texts):
# dse requires non-standard input processing # dse requires non-standard input processing
...@@ -133,20 +123,34 @@ def _run_test( ...@@ -133,20 +123,34 @@ def _run_test(
messages = get_messages(image, text, embed_text) messages = get_messages(image, text, embed_text)
prompt = apply_chat_template_and_add_eos( prompt = apply_chat_template_and_add_eos(
messages, hf_model.processor.apply_chat_template) messages, hf_model.processor.apply_chat_template)
inputs = hf_model.get_inputs(
prompts=[[prompt]], prompts.append(prompt)
images=[[image]],
all_inputs = hf_model.get_inputs(
prompts=prompts,
images=input_images,
) )
with torch.no_grad(): with torch.no_grad():
all_outputs = []
for inputs in all_inputs:
inputs = hf_model.model.prepare_inputs_for_generation(
**inputs,
cache_position=torch.arange(1), # 1 for batch size
use_cache=False,
)
outputs = hf_model.model( outputs = hf_model.model(
**hf_model.wrap_device(inputs[0], **hf_model.wrap_device(inputs),
device=hf_model.model.device.type),
return_dict=True, return_dict=True,
output_hidden_states=True, output_hidden_states=True,
) )
pooled_output = torch.nn.functional.normalize( pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
outputs.hidden_states[-1][0, -1], p=2, dim=-1) p=2,
hf_outputs.append(pooled_output.tolist()) dim=-1)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close( check_embeddings_close(
embeddings_0_lst=hf_outputs, embeddings_0_lst=hf_outputs,
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import pytest import pytest
import torch.nn.functional as F import torch.nn.functional as F
from transformers import AutoModelForVision2Seq from transformers import AutoModelForImageTextToText
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -70,7 +70,7 @@ def _run_test( ...@@ -70,7 +70,7 @@ def _run_test(
vllm_outputs = vllm_model.encode(input_texts, images=input_images) vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype, with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model: auto_cls=AutoModelForImageTextToText) as hf_model:
# Patch the issue where generation_config.json is missing # Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \ hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size hf_model.model.config.vision_config.patch_size
...@@ -86,8 +86,7 @@ def _run_test( ...@@ -86,8 +86,7 @@ def _run_test(
for inputs in all_inputs: for inputs in all_inputs:
# Based on: https://huggingface.co/royokong/e5-v # Based on: https://huggingface.co/royokong/e5-v
outputs = hf_model.model( outputs = hf_model.model(
**hf_model.wrap_device(inputs, **hf_model.wrap_device(inputs),
device=hf_model.model.device.type),
return_dict=True, return_dict=True,
output_hidden_states=True, output_hidden_states=True,
) )
......
...@@ -53,8 +53,7 @@ def _run_test( ...@@ -53,8 +53,7 @@ def _run_test(
for inputs in all_inputs: for inputs in all_inputs:
# Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
outputs = hf_model.model( outputs = hf_model.model(
**hf_model.wrap_device(inputs, **hf_model.wrap_device(inputs),
device=hf_model.model.device.type),
return_dict=True, return_dict=True,
output_hidden_states=True, output_hidden_states=True,
) )
......
...@@ -4,8 +4,7 @@ from typing import Optional, overload ...@@ -4,8 +4,7 @@ from typing import Optional, overload
import pytest import pytest
import torch import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from transformers import AutoConfig, AutoModelForImageTextToText, AutoTokenizer
BatchEncoding)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.backends.flash_attn import FlashAttentionMetadata
...@@ -227,14 +226,10 @@ def _run_test( ...@@ -227,14 +226,10 @@ def _run_test(
for prompts, images in inputs for prompts, images in inputs
] ]
def process(hf_inputs: BatchEncoding, **kwargs):
return hf_inputs
with hf_runner(model, with hf_runner(model,
dtype=dtype, dtype=dtype,
model_kwargs={"device_map": "auto"}, model_kwargs={"device_map": "auto"},
postprocess_inputs=process, auto_cls=AutoModelForImageTextToText) as hf_model:
auto_cls=AutoModelForVision2Seq) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
......
This diff is collapsed.
This diff is collapsed.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import warnings import warnings
from collections.abc import Sequence from collections.abc import Sequence
from typing import Optional, Union from typing import Any, Optional, Union
import torch import torch
...@@ -254,9 +254,9 @@ def check_logprobs_close( ...@@ -254,9 +254,9 @@ def check_logprobs_close(
def build_model_context( def build_model_context(
model_id: str, model_id: str,
task: TaskOption = "auto", task: TaskOption = "auto",
dtype: Optional[Union[str, torch.dtype]] = None, dtype: Union[str, torch.dtype] = "auto",
mm_processor_kwargs: Optional[dict] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None,
limit_mm_per_prompt: Optional[dict] = None, limit_mm_per_prompt: Optional[dict[str, int]] = None,
disable_mm_preprocessor_cache: bool = True, disable_mm_preprocessor_cache: bool = True,
): ):
"""Creates an InputContext for a given model. """Creates an InputContext for a given model.
...@@ -274,9 +274,6 @@ def build_model_context( ...@@ -274,9 +274,6 @@ def build_model_context(
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
if dtype is None:
dtype = "half"
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task=task, task=task,
......
...@@ -7,19 +7,25 @@ from unittest.mock import MagicMock ...@@ -7,19 +7,25 @@ from unittest.mock import MagicMock
import numpy as np import numpy as np
import pytest import pytest
import torch
from transformers import ProcessorMixin from transformers import ProcessorMixin
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem,
MultiModalSharedField)
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
# yapf: disable # yapf: disable
from vllm.multimodal.processing import (PlaceholderFeaturesInfo, from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
PromptIndexTargets, PromptInsertion, ProcessingCache, PromptIndexTargets,
PromptReplacement, apply_text_matches, PromptInsertion, PromptReplacement,
apply_text_matches,
apply_token_matches, apply_token_matches,
find_mm_placeholders, find_mm_placeholders,
find_text_matches, find_token_matches, find_text_matches, find_token_matches,
iter_token_matches) iter_token_matches,
replace_token_matches)
# yapf: enable # yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.profiling import MultiModalProfiler
from vllm.transformers_utils.tokenizer import (AnyTokenizer, from vllm.transformers_utils.tokenizer import (AnyTokenizer,
...@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected): ...@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
assert all(match_len == len(match_ids) for match_len in match_lens) assert all(match_len == len(match_ids) for match_len in match_lens)
# yapf: disable
@pytest.mark.parametrize(
("token_ids", "match_ids", "new_ids", "expected"),
[
([], [], [-1], []),
([], [32000], [-1], []),
(
[32000, 32000, 32000],
[32000],
[-1],
[-1, -1, -1],
),
(
[32000, 32000, 32000],
[32000, 32000],
[-1],
[-1, 32000],
),
(
[32000, 32000, 32000],
[32000, 32000, 32000],
[-1],
[-1],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000],
[-1],
[9833, -1, 32000, 32000, 9833, -1, 32000, 918],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000, 32000, 32000],
[-1],
[9833, -1, 9833, 28747, 32000, 32000, 918],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 0, 32000],
[-1],
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
),
],
)
# yapf: enable
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
result = replace_token_matches(token_ids, match_ids, new_ids)
# Manually constructed results
assert result == expected
# yapf: disable # yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"), ("prompt", "target_by_key", "expected_by_key"),
...@@ -837,6 +895,45 @@ def test_find_mm_placeholders( ...@@ -837,6 +895,45 @@ def test_find_mm_placeholders(
assert result == expected assert result == expected
def _dummy_elem(modality: str, key: str, size: int):
return MultiModalFieldElem(
modality=modality,
key=key,
data=torch.empty((size, ), dtype=torch.int8),
field=MultiModalSharedField(1),
)
def _dummy_item(modality: str, size_by_key: dict[str, int]):
return MultiModalKwargsItem.from_elems([
_dummy_elem(modality, key, size) for key, size in size_by_key.items()
])
def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
return MultiModalKwargs.from_items([
_dummy_item(modality, size_by_key)
for modality, size_by_key in size_by_key_modality.items()
])
# yapf: disable
@pytest.mark.parametrize(
("item", "expected_size"),
[
(_dummy_item("a", {"a1": 100}), 100),
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
(_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
],
)
# yapf: enable
def test_cache_item_size(item, expected_size):
cache = ProcessingCache.get_lru_cache(2048, type(item))
cache[""] = item
assert cache.currsize == expected_size
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
("limit", "num_supported", "is_valid"), ("limit", "num_supported", "is_valid"),
...@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ...@@ -853,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
dtype="half", dtype="auto",
revision=None, revision=None,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
...@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ...@@ -892,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
dtype="half", dtype="auto",
revision=None, revision=None,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
...@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): ...@@ -965,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
dtype="half", dtype="auto",
revision=None, revision=None,
) )
......
...@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, ...@@ -314,7 +314,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
# Test edge cases # Test edge cases
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch (1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
(16, 4, 8, 8192, 48, 1, 128, True), # large prefill batch (16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA) (4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA) (4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
]) ])
......
...@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test ...@@ -15,6 +15,8 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
models_4bit_to_test = [ models_4bit_to_test = [
("facebook/opt-125m", "quantize opt model inflight"), ("facebook/opt-125m", "quantize opt model inflight"),
("mistralai/Mistral-7B-Instruct-v0.3",
"quantize inflight model with both HF and Mistral format weights")
] ]
models_pre_qaunt_4bit_to_test = [ models_pre_qaunt_4bit_to_test = [
......
...@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): ...@@ -166,7 +166,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
test_prompts = multilora_inference.create_test_prompts(lora_path) test_prompts = multilora_inference.create_test_prompts(lora_path)
# Serialize model before deserializing and binding LoRA adapters # Serialize model before deserializing and binding LoRA adapters
with vllm_runner(model_ref, ) as vllm_model: with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors") model_path = tmp_path / (model_ref + ".tensors")
vllm_model.apply_model( vllm_model.apply_model(
...@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner): ...@@ -208,7 +208,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
## Serialize model ## Serialize model
with vllm_runner(model_ref, ) as vllm_model: with vllm_runner(model_ref) as vllm_model:
model_path = tmp_path / (model_ref + ".tensors") model_path = tmp_path / (model_ref + ".tensors")
vllm_model.apply_model( vllm_model.apply_model(
......
...@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir): ...@@ -34,7 +34,9 @@ with depyf.prepare_debug(temp_dir):
# disable custom dispatcher, let Dynamo take over # disable custom dispatcher, let Dynamo take over
# all the control # all the control
llm = LLM(model="google/gemma-2b", llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=512,
max_num_seqs=64,
enforce_eager=True, enforce_eager=True,
compilation_config={"level": CompilationLevel.DYNAMO_AS_IS}) compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir): ...@@ -44,38 +46,51 @@ with depyf.prepare_debug(temp_dir):
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
assert generated_text.startswith(answer) assert generated_text.startswith(answer)
compiled_code = sorted( compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*.py"))) glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
# we should only trigger Dynamo compilation three times: for i, compiled_code in enumerate(compiled_codes):
# one for the profiling phase without kv cache print("{} file: {}".format(i + 1, compiled_code))
# one for the prefill phase with symbolic shapes
# one for the decode phase with symbolic shapes # We should only trigger Dynamo compilation 4 times:
# 1. forward pass (symbolic)
# 2. compute_logits (symbolic)
# 3. forward pass (shape 16)
# 4. forward pass (shape 32)
# and later calls should not trigger Dynamo compilation again. # and later calls should not trigger Dynamo compilation again.
# NOTE: it might still trigger XLA compilation. # NOTE: It might still trigger XLA compilation.
# Check we have 4 compiled codes
assert len(compiled_codes) == 4
# check we have three compiled code kv_cache_prefix = "kv_cache"
# this is the assumption when we use the custom dispatcher attn_prefix = "ragged_paged_attention"
assert len(compiled_code) == 3
# check all the compilations are as expected # Check all the compilations are as expected
compiled_fn = sorted( compiled_fns = sorted(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py"))) glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
# the first compilation is the profiling phase, for i, compiled_fn in enumerate(compiled_fns):
# it should not have any kv cache print("{} file: {}".format(i + 1, compiled_fn))
with open(compiled_fn[0]) as f:
# The first compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[0]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The second compilation is symbolic, so it should not have any kv_caches
with open(compiled_fns[1]) as f:
content = f.read() content = f.read()
assert "kv_caches" not in content assert kv_cache_prefix not in content
# the second compilation is the prefill phase, # The third compilation is shape 16, so it should have kv_caches and the
# it should have kv cache and the flash_attention op # ragged_paged_attention
with open(compiled_fn[1]) as f: with open(compiled_fns[2]) as f:
content = f.read() content = f.read()
assert "kv_caches" in content and "torch.ops.xla.flash_attention" in content assert (kv_cache_prefix in content and attn_prefix in content)
# the third compilation is the decode phase, # The fourth compilation is shape 32, so it should have kv_caches and the
# it should have kv cache and the paged_attention op # ragged_paged_attention
with open(compiled_fn[2]) as f: with open(compiled_fns[3]) as f:
content = f.read() content = f.read()
assert "kv_caches" in content and "torch.ops.xla.paged_attention" in content assert (kv_cache_prefix in content and attn_prefix in content)
...@@ -14,12 +14,17 @@ from ..utils import compare_two_settings ...@@ -14,12 +14,17 @@ from ..utils import compare_two_settings
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch): def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_RPC_TIMEOUT", "30000") m.setenv("VLLM_RPC_TIMEOUT", "30000")
compare_two_settings( compare_two_settings("Qwen/Qwen2.5-1.5B-Instruct",
"google/gemma-2b",
arg1=[ arg1=[
"--max-model-len=256",
"--max-num-seqs=32",
"--enforce-eager", "--enforce-eager",
f"-O{CompilationLevel.DYNAMO_ONCE}", f"-O{CompilationLevel.DYNAMO_ONCE}",
], ],
arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], arg2=[
"--max-model-len=256", "--max-num-seqs=32",
"--enforce-eager",
f"-O{CompilationLevel.DYNAMO_AS_IS}"
],
env1={}, env1={},
env2={}) env2={})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment