Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of a GPTQ model to a Marlin_24 model.
 Note: GPTQ and Marlin_24 do not have bitwise correctness.

--- a/tests/models/decoder_only/language/test_granite.py
+++ b/tests/models/decoder_only/language/test_granite.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Granite models using greedy sampling.
 Run `pytest tests/models/test_granite.py`.

--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 import os

--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
 Run `pytest tests/models/test_mamba.py`.

--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 Run `pytest tests/models/test_mistral.py`.

--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
+# SPDX-License-Identifier: Apache-2.0
 # flake8: noqa
 """Tests Model Optimizer fp8 models against ground truth generation
 Note: these tests will only pass on H100

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM when using greedy sampling.
 Run `pytest tests/models/test_models.py`.

--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for moe models using greedy sampling.
 Run `pytest tests/models/test_phimoe.py`.

--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Optional, Type
 import os

--- a/tests/models/decoder_only/vision_language/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import Optional
 import os

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
+# SPDX-License-Identifier: Apache-2.0
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
@@ -9,6 +10,7 @@ from typing import Type
 import os
 import pytest
+from packaging.version import Version
 from transformers import AutoModelForVision2Seq
 from transformers import __version__ as TRANSFORMERS_VERSION
@@ -121,6 +123,8 @@ VLM_TEST_SETTINGS = {
               else ("half", "float")),
        marks=[pytest.mark.core_model],
    ),
+    # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL
+    # once we have upgraded to transformers>=4.49.0.
    "qwen2_vl": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")],
        test_type=(
@@ -138,6 +142,26 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
+    # Qwen2.5-VL: image, multi-image and video comparison against the HF
+    # AutoModelForVision2Seq implementation.
+    "qwen2_5_vl": VLMTestInfo(
+        # Resolve the checkpoint through models_path_prefix like the other
+        # entries in this table.
+        models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],  # noqa: E501
+        test_type=(
+            VLMTestType.IMAGE,
+            VLMTestType.MULTI_IMAGE,
+            VLMTestType.VIDEO
+        ),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        marks=[
+            # Compare parsed versions rather than raw strings: as strings,
+            # "4.100.0" sorts before "4.49.0" and "4.5.0" sorts after it.
+            # The ".dev0" bound keeps 4.49.0 development builds eligible,
+            # which the previous string comparison also let through.
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) < Version("4.49.0.dev0"),
+                reason="HF model requires transformers>=4.49.0",
+            ),
+            pytest.mark.core_model,
+            pytest.mark.cpu_model,
+        ],
+    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
@@ -155,13 +179,7 @@ VLM_TEST_SETTINGS = {
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
-        marks=[
+        marks=[large_gpu_mark(min_gb=64)],
-            pytest.mark.skipif(
-                TRANSFORMERS_VERSION < "4.48.0",
-                reason="HF model requires transformers>=4.48.0",
-            ),
-            large_gpu_mark(min_gb=64),
-        ],
    ),
    "blip2": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
@@ -207,8 +225,8 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
        marks=[
            pytest.mark.skipif(
-                TRANSFORMERS_VERSION >= "4.48.0",
+                Version(TRANSFORMERS_VERSION) >= Version("4.48"),
-                reason="HF model is not compatible with transformers>=4.48.0",
+                reason="HF model is not compatible with transformers>=4.48",
            )
        ],
    ),
@@ -251,17 +269,18 @@ VLM_TEST_SETTINGS = {
        max_model_len=8192,
        dtype="bfloat16",
        use_tokenizer_eos=True,
+        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
    ),
    "idefics3": VLMTestInfo(
-        models=[os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")],
+        models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
-        marks=[large_gpu_mark(min_gb=48)],
+        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
        models=[
@@ -283,7 +302,6 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=32)],
    ),
    "llava_next": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
@@ -340,6 +358,12 @@ VLM_TEST_SETTINGS = {
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
+        marks=[
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) >= Version("4.48"),
+                reason="HF model is not compatible with transformers>=4.48",
+            )
+        ],
    ),
    "minicpmv_25": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")],

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import re
 from typing import List, Optional, Tuple, Type

--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
 Run `pytest tests/models/test_mistral.py`.

--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import Any, List, Optional, Tuple, Type, TypedDict, Union
 import os

--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+# SPDX-License-Identifier: Apache-2.0
 """Helpers for building inputs that can be leveraged for different test types.
 """
 from pathlib import PosixPath

--- a/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py
+# SPDX-License-Identifier: Apache-2.0
 """Utils for determining which subset of model tests belong to a specific
 modality, getting all combinations (similar to pytest's parametrization),
 handling multimodal placeholder substitution, and so on.

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
+# SPDX-License-Identifier: Apache-2.0
 """Core test implementation to be shared across modalities."""
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
@@ -153,4 +154,4 @@ def process_runner_outputs(
 def process_outputs(output_processor, model, outputs_per_image):
    """Run a model-specific post-processor over every runner output.

    ``outputs_per_image`` is walked as a sequence of sequences. Each inner
    element is replaced by ``output_processor(element, model)`` and the
    nesting of the input is preserved in the returned list of lists.
    """
    processed = []
    for image_outputs in outputs_per_image:
        processed.append(
            [output_processor(result, model) for result in image_outputs])
    return processed
\ No newline at end of file
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+# SPDX-License-Identifier: Apache-2.0
 """Custom input builders for edge-cases in different models."""
 from typing import Callable

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+# SPDX-License-Identifier: Apache-2.0
 """Common utility functions relating to different models that are useful
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
@@ -191,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
    return output_ids, output_str, out_logprobs
+def idefics3_trunc_hf_output(hf_output: RunnerOutput,
+                             model: str) -> RunnerOutput:
+    """Strip a trailing ``<end_of_utterance>`` marker from the HF output text.
+
+    Only the string element of ``hf_output`` is modified, and only when it
+    ends with the marker; in that case the text is cut at the first
+    occurrence of the marker. The ids and logprobs are returned unchanged.
+    ``model`` is accepted but not used here.
+    """
+    eos_marker = "<end_of_utterance>"
+    token_ids, text, logprobs = hf_output
+    if text.endswith(eos_marker):
+        # Keep everything that precedes the first marker occurrence.
+        text, _, _ = text.partition(eos_marker)
+    return token_ids, text, logprobs
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
@@ -333,12 +342,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer
-            self.dtype = hf_runner.model.dtype
            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                     trust_remote_code=True)
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
+            self.use_msac = self.config.use_msac
            self.min_num = self.config.min_dynamic_patch
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size
@@ -347,18 +356,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                     **kwargs):
            # yapf: disable
            from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
            # yapf: enable
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
-                image_to_pixel_values(image,
+                image_to_pixel_values_h2ovl(
-                                      self.image_size,
+                    image,
-                                      self.min_num,
+                    input_size=self.image_size,
-                                      self.max_num,
+                    min_num=self.min_num,
-                                      self.use_thumbnail,
+                    max_num=self.max_num,
-                                      use_MSAC=self.config.use_msac).to(
+                    use_thumbnail=self.use_thumbnail,
-                                          self.dtype) for image in images
+                    use_msac=self.use_msac,
+                ) for image in images
            ]
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
@@ -393,7 +403,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer
-            self.dtype = hf_runner.model.dtype
            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                     trust_remote_code=True)
@@ -406,13 +415,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        def __call__(self, text: str, images: Union[Image, List[Image]],
                     **kwargs):
            from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START,
+                image_to_pixel_values_internvl)
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
-                image_to_pixel_values(image, self.image_size, self.min_num,
+                image_to_pixel_values_internvl(
-                                      self.max_num,
+                    image,
-                                      self.use_thumbnail).to(self.dtype)
+                    input_size=self.image_size,
-                for image in images
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                ) for image in images
            ]
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
@@ -447,7 +460,8 @@ def _internvl_generate(
 ) -> torch.LongTensor:
    """Generate method for InternVL2 model without fixed use_cache."""
    assert self.img_context_token_id is not None
-    vit_embeds = self.extract_feature(pixel_values)
+    target_dtype = next(self.parameters()).dtype
+    vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
    input_embeds = self.language_model.get_input_embeddings()(input_ids)
    B, N, C = input_embeds.shape
    input_embeds = input_embeds.reshape(B * N, C)

--- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py
+# SPDX-License-Identifier: Apache-2.0
 """Entrypoints for wrapping the core run_test implementation for specific test
 types / modalities.
 """