Commit 66b809cc authored by zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of a GPTQ model to a Marlin_24 model. """Compare the outputs of a GPTQ model to a Marlin_24 model.
Note: GPTQ and Marlin_24 do not have bitwise correctness. Note: GPTQ and Marlin_24 do not have bitwise correctness.
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. """Compare the outputs of HF and vLLM for Granite models using greedy sampling.
Run `pytest tests/models/test_granite.py`. Run `pytest tests/models/test_granite.py`.
......
# SPDX-License-Identifier: Apache-2.0
import pytest import pytest
import os import os
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba. """Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
Run `pytest tests/models/test_mamba.py`. Run `pytest tests/models/test_mamba.py`.
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`. Run `pytest tests/models/test_mistral.py`.
......
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa # flake8: noqa
"""Tests Model Optimizer fp8 models against ground truth generation """Tests Model Optimizer fp8 models against ground truth generation
Note: these tests will only pass on H100 Note: these tests will only pass on H100
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM when using greedy sampling. """Compare the outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/models/test_models.py`. Run `pytest tests/models/test_models.py`.
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for moe models using greedy sampling. """Compare the outputs of HF and vLLM for moe models using greedy sampling.
Run `pytest tests/models/test_phimoe.py`. Run `pytest tests/models/test_phimoe.py`.
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Type from typing import List, Optional, Type
import os import os
......
# SPDX-License-Identifier: Apache-2.0
from typing import Optional from typing import Optional
import os import os
......
# SPDX-License-Identifier: Apache-2.0
"""Common tests for testing .generate() functionality for single / multiple """Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM. image, embedding, and video support for different VLMs in vLLM.
""" """
...@@ -9,6 +10,7 @@ from typing import Type ...@@ -9,6 +10,7 @@ from typing import Type
import os import os
import pytest import pytest
from packaging.version import Version
from transformers import AutoModelForVision2Seq from transformers import AutoModelForVision2Seq
from transformers import __version__ as TRANSFORMERS_VERSION from transformers import __version__ as TRANSFORMERS_VERSION
...@@ -121,6 +123,8 @@ VLM_TEST_SETTINGS = { ...@@ -121,6 +123,8 @@ VLM_TEST_SETTINGS = {
else ("half", "float")), else ("half", "float")),
marks=[pytest.mark.core_model], marks=[pytest.mark.core_model],
), ),
# TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
# once we upgrade to transformers>=4.49.0.
"qwen2_vl": VLMTestInfo( "qwen2_vl": VLMTestInfo(
models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")], models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")],
test_type=( test_type=(
...@@ -138,6 +142,26 @@ VLM_TEST_SETTINGS = { ...@@ -138,6 +142,26 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
"qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skipif(
TRANSFORMERS_VERSION < "4.49.0",
reason="HF model requires transformers>=4.49.0",
), pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")], models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
...@@ -155,13 +179,7 @@ VLM_TEST_SETTINGS = { ...@@ -155,13 +179,7 @@ VLM_TEST_SETTINGS = {
stop_str=["<|im_end|>"], stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)], image_size_factors=[(0.10, 0.15)],
max_tokens=64, max_tokens=64,
marks=[ marks=[large_gpu_mark(min_gb=64)],
pytest.mark.skipif(
TRANSFORMERS_VERSION < "4.48.0",
reason="HF model requires transformers>=4.48.0",
),
large_gpu_mark(min_gb=64),
],
), ),
"blip2": VLMTestInfo( "blip2": VLMTestInfo(
models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")], models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
...@@ -207,8 +225,8 @@ VLM_TEST_SETTINGS = { ...@@ -207,8 +225,8 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
marks=[ marks=[
pytest.mark.skipif( pytest.mark.skipif(
TRANSFORMERS_VERSION >= "4.48.0", Version(TRANSFORMERS_VERSION) >= Version("4.48"),
reason="HF model is not compatible with transformers>=4.48.0", reason="HF model is not compatible with transformers>=4.48",
) )
], ],
), ),
...@@ -251,17 +269,18 @@ VLM_TEST_SETTINGS = { ...@@ -251,17 +269,18 @@ VLM_TEST_SETTINGS = {
max_model_len=8192, max_model_len=8192,
dtype="bfloat16", dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
num_logprobs=10,
patch_hf_runner=model_utils.h2ovl_patch_hf_runner, patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
), ),
"idefics3": VLMTestInfo( "idefics3": VLMTestInfo(
models=[os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")], models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501 prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>", img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
marks=[large_gpu_mark(min_gb=48)], hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
), ),
"intern_vl": VLMTestInfo( "intern_vl": VLMTestInfo(
models=[ models=[
...@@ -283,7 +302,6 @@ VLM_TEST_SETTINGS = { ...@@ -283,7 +302,6 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
), ),
"llava_next": VLMTestInfo( "llava_next": VLMTestInfo(
models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
...@@ -340,6 +358,12 @@ VLM_TEST_SETTINGS = { ...@@ -340,6 +358,12 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
patch_hf_runner=model_utils.mantis_patch_hf_runner, patch_hf_runner=model_utils.mantis_patch_hf_runner,
marks=[
pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) >= Version("4.48"),
reason="HF model is not compatible with transformers>=4.48",
)
],
), ),
"minicpmv_25": VLMTestInfo( "minicpmv_25": VLMTestInfo(
models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")], models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")],
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import re import re
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling. """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`. Run `pytest tests/models/test_mistral.py`.
......
# SPDX-License-Identifier: Apache-2.0
from typing import Any, List, Optional, Tuple, Type, TypedDict, Union from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
import os import os
......
# SPDX-License-Identifier: Apache-2.0
"""Helpers for building inputs that can be leveraged for different test types. """Helpers for building inputs that can be leveraged for different test types.
""" """
from pathlib import PosixPath from pathlib import PosixPath
......
# SPDX-License-Identifier: Apache-2.0
"""Utils for determining which subset of model tests belong to a specific """Utils for determining which subset of model tests belong to a specific
modality, getting all combinations (similar to pytest's parametrization), modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on. handling multimodal placeholder substitution, and so on.
......
# SPDX-License-Identifier: Apache-2.0
"""Core test implementation to be shared across modalities.""" """Core test implementation to be shared across modalities."""
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
...@@ -153,4 +154,4 @@ def process_runner_outputs( ...@@ -153,4 +154,4 @@ def process_runner_outputs(
def process_outputs(output_processor, model, outputs_per_image): def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output""" """Applies a model specific post-processor function to a runner's output"""
return [[output_processor(res, model) for res in outputs] return [[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image] for outputs in outputs_per_image]
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
"""Custom input builders for edge-cases in different models.""" """Custom input builders for edge-cases in different models."""
from typing import Callable from typing import Callable
......
# SPDX-License-Identifier: Apache-2.0
"""Common utility functions relating to different models that are useful """Common utility functions relating to different models that are useful
for manipulating the input / output of HF & vLLM test runners, which are for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models. typically specific to a small subset of models.
...@@ -191,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, ...@@ -191,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
return output_ids, output_str, out_logprobs return output_ids, output_str, out_logprobs
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    """Drop the trailing ``<end_of_utterance>`` marker from an HF output.

    Args:
        hf_output: Triple of ``(output_ids, output_str, out_logprobs)``.
        model: Model name; not used by this post-processor.

    Returns:
        The same triple. If the text ends with ``<end_of_utterance>``, the
        text is cut at the first occurrence of that marker; otherwise it is
        returned unchanged. The ids and logprobs are passed through as-is.
    """
    eos_marker = "<end_of_utterance>"
    token_ids, text, logprobs = hf_output
    if not text.endswith(eos_marker):
        return token_ids, text, logprobs
    # Keep only what precedes the first marker (same as split(marker)[0]).
    return token_ids, text.partition(eos_marker)[0], logprobs
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput: model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output output_ids, output_str, out_logprobs = hf_output
...@@ -333,12 +342,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -333,12 +342,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def __init__(self, hf_runner: HfRunner): def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype
self.config = AutoConfig.from_pretrained(hf_runner.model_name, self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True) trust_remote_code=True)
self.vision_config = self.config.vision_config self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail self.use_thumbnail = self.config.use_thumbnail
self.use_msac = self.config.use_msac
self.min_num = self.config.min_dynamic_patch self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size self.image_size = self.vision_config.image_size
...@@ -347,18 +356,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -347,18 +356,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
**kwargs): **kwargs):
# yapf: disable # yapf: disable
from vllm.model_executor.models.h2ovl import ( from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
# yapf: enable # yapf: enable
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values(image, image_to_pixel_values_h2ovl(
self.image_size, image,
self.min_num, input_size=self.image_size,
self.max_num, min_num=self.min_num,
self.use_thumbnail, max_num=self.max_num,
use_MSAC=self.config.use_msac).to( use_thumbnail=self.use_thumbnail,
self.dtype) for image in images use_msac=self.use_msac,
) for image in images
] ]
num_patches_list = [ num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values pixel_value.shape[0] for pixel_value in pixel_values
...@@ -393,7 +403,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -393,7 +403,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def __init__(self, hf_runner: HfRunner): def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype
self.config = AutoConfig.from_pretrained(hf_runner.model_name, self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True) trust_remote_code=True)
...@@ -406,13 +415,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -406,13 +415,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def __call__(self, text: str, images: Union[Image, List[Image]], def __call__(self, text: str, images: Union[Image, List[Image]],
**kwargs): **kwargs):
from vllm.model_executor.models.internvl import ( from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl)
images = [images] if isinstance(images, Image) else images images = [images] if isinstance(images, Image) else images
pixel_values = [ pixel_values = [
image_to_pixel_values(image, self.image_size, self.min_num, image_to_pixel_values_internvl(
self.max_num, image,
self.use_thumbnail).to(self.dtype) input_size=self.image_size,
for image in images min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
] ]
num_patches_list = [ num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values pixel_value.shape[0] for pixel_value in pixel_values
...@@ -447,7 +460,8 @@ def _internvl_generate( ...@@ -447,7 +460,8 @@ def _internvl_generate(
) -> torch.LongTensor: ) -> torch.LongTensor:
"""Generate method for InternVL2 model without fixed use_cache.""" """Generate method for InternVL2 model without fixed use_cache."""
assert self.img_context_token_id is not None assert self.img_context_token_id is not None
vit_embeds = self.extract_feature(pixel_values) target_dtype = next(self.parameters()).dtype
vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
input_embeds = self.language_model.get_input_embeddings()(input_ids) input_embeds = self.language_model.get_input_embeddings()(input_ids)
B, N, C = input_embeds.shape B, N, C = input_embeds.shape
input_embeds = input_embeds.reshape(B * N, C) input_embeds = input_embeds.reshape(B * N, C)
......
# SPDX-License-Identifier: Apache-2.0
"""Entrypoints for wrapping the core run_test implementation for specific test """Entrypoints for wrapping the core run_test implementation for specific test
types / modalities. types / modalities.
""" """
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment