Unverified commit b89fb2a4, authored by Cyrus Leung and committed by GitHub
Browse files

[CI/Build] Use `AutoModelForImageTextToText` to load VLMs in tests (#14945)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 5340b0e2
...@@ -9,7 +9,7 @@ from pathlib import PosixPath ...@@ -9,7 +9,7 @@ from pathlib import PosixPath
import pytest import pytest
from packaging.version import Version from packaging.version import Version
from transformers import AutoModelForPreTraining, AutoModelForVision2Seq from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = { ...@@ -101,7 +101,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings, convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
...@@ -121,7 +121,7 @@ VLM_TEST_SETTINGS = { ...@@ -121,7 +121,7 @@ VLM_TEST_SETTINGS = {
"stop_sign": "caption es", "stop_sign": "caption es",
"cherry_blossom": "What is in the picture?", "cherry_blossom": "What is in the picture?",
}), }),
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor( postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values" "pixel_values"
), ),
...@@ -190,7 +190,7 @@ VLM_TEST_SETTINGS = { ...@@ -190,7 +190,7 @@ VLM_TEST_SETTINGS = {
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output, vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
), ),
"chameleon": VLMTestInfo( "chameleon": VLMTestInfo(
...@@ -199,7 +199,7 @@ VLM_TEST_SETTINGS = { ...@@ -199,7 +199,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor( postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values" "pixel_values"
), ),
...@@ -240,6 +240,7 @@ VLM_TEST_SETTINGS = { ...@@ -240,6 +240,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
use_tokenizer_eos=True, use_tokenizer_eos=True,
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10, num_logprobs=10,
...@@ -256,8 +257,7 @@ VLM_TEST_SETTINGS = { ...@@ -256,8 +257,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
# TODO: Use AutoModelForVision2Seq once transformers supports this auto_cls=AutoModelForImageTextToText,
auto_cls=AutoModelForPreTraining,
dtype="bfloat16", dtype="bfloat16",
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
...@@ -307,7 +307,7 @@ VLM_TEST_SETTINGS = { ...@@ -307,7 +307,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "<image>", img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
hf_output_post_proc=model_utils.idefics3_trunc_hf_output, hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
), ),
"intern_vl": VLMTestInfo( "intern_vl": VLMTestInfo(
...@@ -336,7 +336,7 @@ VLM_TEST_SETTINGS = { ...@@ -336,7 +336,7 @@ VLM_TEST_SETTINGS = {
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions( custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
...@@ -382,7 +382,7 @@ VLM_TEST_SETTINGS = { ...@@ -382,7 +382,7 @@ VLM_TEST_SETTINGS = {
"pixel_values" "pixel_values"
), ),
get_stop_token_ids=lambda tok: [128009], get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
patch_hf_runner=model_utils.mantis_patch_hf_runner, patch_hf_runner=model_utils.mantis_patch_hf_runner,
marks=[ marks=[
...@@ -463,7 +463,7 @@ VLM_TEST_SETTINGS = { ...@@ -463,7 +463,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: "[IMG]", img_idx_to_prompt=lambda idx: "[IMG]",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"qwen_vl": VLMTestInfo( "qwen_vl": VLMTestInfo(
...@@ -481,7 +481,7 @@ VLM_TEST_SETTINGS = { ...@@ -481,7 +481,7 @@ VLM_TEST_SETTINGS = {
models=["facebook/chameleon-7b"], models=["facebook/chameleon-7b"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
postprocess_inputs=model_utils.cast_dtype_post_processor( postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values" "pixel_values"
), ),
...@@ -495,7 +495,7 @@ VLM_TEST_SETTINGS = { ...@@ -495,7 +495,7 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-1.5-7b-hf"], models=["llava-hf/llava-1.5-7b-hf"],
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
max_model_len=4096, max_model_len=4096,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS # type: ignore
...@@ -504,7 +504,7 @@ VLM_TEST_SETTINGS = { ...@@ -504,7 +504,7 @@ VLM_TEST_SETTINGS = {
models=["llava-hf/llava-v1.6-mistral-7b-hf"], models=["llava-hf/llava-v1.6-mistral-7b-hf"],
prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]", prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
max_model_len=10240, max_model_len=10240,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
marks=multi_gpu_marks(num_gpus=2), marks=multi_gpu_marks(num_gpus=2),
**COMMON_BROADCAST_SETTINGS # type: ignore **COMMON_BROADCAST_SETTINGS # type: ignore
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import pytest import pytest
import torch.nn.functional as F import torch.nn.functional as F
from transformers import AutoModelForVision2Seq from transformers import AutoModelForImageTextToText
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -70,7 +70,7 @@ def _run_test( ...@@ -70,7 +70,7 @@ def _run_test(
vllm_outputs = vllm_model.encode(input_texts, images=input_images) vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype, with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model: auto_cls=AutoModelForImageTextToText) as hf_model:
# Patch the issue where generation_config.json is missing # Patch the issue where generation_config.json is missing
hf_model.processor.patch_size = \ hf_model.processor.patch_size = \
hf_model.model.config.vision_config.patch_size hf_model.model.config.vision_config.patch_size
......
...@@ -4,8 +4,8 @@ from typing import Optional, overload ...@@ -4,8 +4,8 @@ from typing import Optional, overload
import pytest import pytest
import torch import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from transformers import (AutoConfig, AutoModelForImageTextToText,
BatchEncoding) AutoTokenizer, BatchEncoding)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.backends.flash_attn import FlashAttentionMetadata
...@@ -234,7 +234,7 @@ def _run_test( ...@@ -234,7 +234,7 @@ def _run_test(
dtype=dtype, dtype=dtype,
model_kwargs={"device_map": "auto"}, model_kwargs={"device_map": "auto"},
postprocess_inputs=process, postprocess_inputs=process,
auto_cls=AutoModelForVision2Seq) as hf_model: auto_cls=AutoModelForImageTextToText) as hf_model:
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts, hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens, max_tokens,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment