Merge tag 'v0.6.1' into v0.6.1-dev

4851c202 · zhuwenwen · 9b902f9e · 3fd2b0d2 · 4851c202 · 4851c202
Commit 4851c202 authored Sep 13, 2024 by zhuwenwen
20 changed files
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -7,26 +7,6 @@ import pytest

 from tests.quantization.utils import is_quant_method_supported

-# In this test we hardcode prompts and generations for the model so we don't
-# need to require the AQLM package as a dependency
-example_prompts = [
-    'vLLM is a high-throughput and memory-efficient inference and serving '
-    'engine for LLMs.\n',
-    'Briefly describe the major milestones in the development of artificial '
-    'intelligence from 1950 to 2020.\n',
-    'Compare and contrast artificial intelligence with human intelligence in '
-    'terms of processing information.\n',
-    'Describe the basic components of a neural network and how it can be '
-    'trained.\n',
-    'Write a short story about a robot that dreams for the first time.\n',
-    'Analyze the impact of the COVID-19 pandemic on global economic structures '
-    'and future business models.\n',
-    'Explain the cultural significance of the Mona Lisa painting, and how its '
-    'perception might vary in Western versus Eastern societies.\n',
-    "Translate the following English sentence into Japanese, French, and "
-    "Swahili: 'The early bird catches the worm.'\n"
-]
-
 # These ground truth generations were generated using `transformers==4.38.1
 # aqlm==1.1.0 torch==2.2.0`
 # and the below code:

--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
 import types
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union

 import pytest
 import torch
@@ -9,7 +9,8 @@ from transformers import AutoConfig
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                        _ImageAssets)
 from .utils import check_logprobs_close

 pytestmark = pytest.mark.vlm
@@ -20,6 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "cherry_blossom":
    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
 })
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501

 models = [
    "OpenGVLab/InternVL2-1B",
@@ -64,13 +66,13 @@ def generate(
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], PromptImageInput]],
    model: str,
    *,
-    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
+    mm_limit: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
@@ -83,12 +85,6 @@ def run_test(
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
@@ -110,13 +106,21 @@ def run_test(
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

-        def __call__(self, text: str, images: Image, **kwargs):
+        def __call__(self, text: str, images: Union[Image, List[Image]],
+                     **kwargs):
            from vllm.model_executor.models.internvl import (
                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
-            pixel_values = image_to_pixel_values(
-                images, self.image_size, self.min_num, self.max_num,
+            images = [images] if isinstance(images, Image) else images
+            pixel_values = [
+                image_to_pixel_values(image, self.image_size, self.min_num,
+                                      self.max_num,
                                      self.use_thumbnail).to(self.dtype)
-            num_patches_list = [pixel_values.shape[0]]
+                for image in images
+            ]
+            num_patches_list = [
+                pixel_value.shape[0] for pixel_value in pixel_values
+            ]
+            pixel_values = torch.cat(pixel_values, dim=0)
            for num_patches in num_patches_list:
                context_tokens = IMG_CONTEXT * self.num_image_token \
                    * num_patches
@@ -130,6 +134,7 @@ def run_test(
    with vllm_runner(model,
                     max_model_len=4096,
                     dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
@@ -138,7 +143,7 @@ def run_test(
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    with hf_runner(model, dtype=dtype) as hf_model:
@@ -156,7 +161,7 @@ def run_test(
                                                    num_logprobs=num_logprobs,
                                                    images=hf_images,
                                                    eos_token_id=eos_token_id)
-            for prompts, hf_images in inputs_per_image
+            for prompts, hf_images in inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -264,15 +269,64 @@ if is_cpu():
 @torch.inference_mode()
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
    run_test(
        hf_runner,
        vllm_runner,
-        image_assets,
+        inputs_per_image,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.5, 0.75, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@torch.inference_mode()
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
+                             size_factors, dtype: str, max_tokens: int,
+                             num_logprobs: int) -> None:
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_case = [
+        ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+         [[rescale_image_size(image, factor) for image in images]
+          for factor in size_factors])
+    ]
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_case,
        model,
-        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
+        mm_limit=2,
        tensor_parallel_size=1,
    )


--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, overload

 import pytest
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
@@ -8,11 +8,14 @@ from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                        _ImageAssets)
 from .utils import check_logprobs_close

 pytestmark = pytest.mark.vlm

+_LIMIT_IMAGE_PER_PROMPT = 4
+
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
@@ -52,6 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
    return hf_output_ids, hf_output_str, out_logprobs


+@overload
 def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
@@ -64,6 +68,78 @@ def run_test(
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    sizes: List[Tuple[int, int]],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    size_factors: Optional[List[float]] = None,
+    sizes: Optional[List[Tuple[int, int]]] = None,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    images = [asset.pil_image for asset in image_assets]
+
+    if size_factors is not None:
+        inputs_per_image = [(
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    elif sizes is not None:
+        inputs_per_image = [(
+            [prompt for _ in sizes],
+            [image.resize(size) for size in sizes],
+        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    else:
+        raise ValueError("You must provide either `size_factors` or `sizes`")
+
+    _run_test(hf_runner,
+              vllm_runner,
+              inputs_per_image,
+              model,
+              dtype=dtype,
+              max_tokens=max_tokens,
+              num_logprobs=num_logprobs,
+              tensor_parallel_size=tensor_parallel_size,
+              distributed_executor_backend=distributed_executor_backend)
+
+
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    inputs: List[Tuple[List[str], PromptImageInput]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
 ):
    """Inference result should be the same between hf and vllm.

@@ -85,13 +161,6 @@ def run_test(
    else:
        mantis_processor = None

-    images = [asset.pil_image for asset in image_assets]
-
-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
@@ -100,15 +169,18 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
+                     max_model_len=4096,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True) as vllm_model:
+                     enforce_eager=True,
+                     limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
+                                          }) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    if mantis_processor is not None:
@@ -131,7 +203,7 @@ def run_test(
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
-            for prompts, images in inputs_per_image
+            for prompts, images in inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
@@ -181,6 +253,51 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
    )


+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
+                                      model, dtype, max_tokens,
+                                      num_logprobs) -> None:
+    stop_sign = image_assets[0].pil_image
+    cherry_blossom = image_assets[1].pil_image
+
+    inputs = [(
+        [
+            "USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
+            "USER: <image><image>\nDescribe 2 images.\nASSISTANT:",
+            "USER: <image><image><image><image>\nDescribe 4 images.\nASSISTANT:",  # noqa: E501
+            "USER: <image>\nWhat is the season?\nASSISTANT:",
+        ],
+        [
+            [stop_sign, cherry_blossom],
+            # Images with different sizes and aspect-ratios
+            [
+                rescale_image_size(stop_sign, 0.1),
+                stop_sign,
+            ],
+            [
+                stop_sign,
+                rescale_image_size(stop_sign, 0.25),
+                cherry_blossom.resize((183, 488)),
+                cherry_blossom.resize((488, 183))
+            ],
+            cherry_blossom,
+        ])]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        inputs,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
+
+
 @pytest.mark.parametrize("model", models)
 def test_context_length_too_short(vllm_runner, image_assets, model):
    images = [asset.pil_image for asset in image_assets]

--- a/tests/models/test_llava_next_video.py
+++ b/tests/models/test_llava_next_video.py
+from typing import List, Optional, Tuple, Type, overload
+
+import pytest
+import transformers
+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
+
+from vllm.multimodal.utils import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
+from vllm.sequence import SampleLogprobs
+
+from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
+from .utils import check_logprobs_close
+
+pytestmark = pytest.mark.vlm
+
+_PREFACE = (
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's "
+    "questions.")
+
+HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
+    "sample_demo_1":
+    f"{_PREFACE}USER: <video>\nWhy is this video funny? ASSISTANT:"
+})
+
+models = ["llava-hf/LLaVA-NeXT-Video-7B-hf"]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+
+    config = AutoConfig.from_pretrained(model)
+    video_token_id = config.video_token_index
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
+    ]
+
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+
+    return hf_output_ids, hf_output_str, out_logprobs
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+@overload
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    sizes: List[Tuple[int, int]],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    ...
+
+
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    video_assets: _VideoAssets,
+    model: str,
+    *,
+    size_factors: Optional[List[float]] = None,
+    sizes: Optional[List[Tuple[int, int]]] = None,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    num_frames: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    videos = [
+        sample_frames_from_video(asset.np_ndarrays, num_frames)
+        for asset in video_assets
+    ]
+
+    for video in videos:
+        print(video.shape)
+
+    if size_factors is not None:
+        inputs_per_video = [(
+            [prompt for _ in size_factors],
+            [rescale_video_size(video, factor) for factor in size_factors],
+        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
+    elif sizes is not None:
+        inputs_per_video = [(
+            [prompt for _ in sizes],
+            [resize_video(video, size) for size in sizes],
+        ) for video, prompt in zip(videos, HF_VIDEO_PROMPTS)]
+    else:
+        raise ValueError("You must provide either `size_factors` or `sizes`")
+
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=4096,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_video = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                videos=videos)
+            for prompts, videos in inputs_per_video
+        ]
+
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
+        hf_outputs_per_video = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    videos=videos)
+            for prompts, videos in inputs_per_video
+        ]
+
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_video,
+                                        vllm_outputs_per_video):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, model)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+
+
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="Waiting for next transformers release")
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No video
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_frames", [16])
+def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
+                dtype, max_tokens, num_logprobs, num_frames) -> None:
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test is under tests/videos.
+    For huggingface runner, we provide the np.ndarray as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding MultiModalConfig as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    run_test(
+        hf_runner,
+        vllm_runner,
+        video_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        num_frames=num_frames,
+        tensor_parallel_size=1,
+    )
+
+
+@pytest.mark.skipif(transformers.__version__ < "4.45",
+                    reason="Waiting for next transformers release")
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "sizes",
+    [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_frames", [16])
+def test_models_fixed_sizes(hf_runner, vllm_runner, video_assets, model, sizes,
+                            dtype, max_tokens, num_logprobs,
+                            num_frames) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        video_assets,
+        model,
+        sizes=sizes,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        num_frames=num_frames,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -41,3 +41,43 @@ def test_models(
        name_0="hf",
        name_1="vllm",
    )
+
+
+@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_mistral_format(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tokenizer_mode="auto",
+            load_format="safetensors",
+            config_format="hf",
+    ) as hf_format_model:
+        hf_format_outputs = hf_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tokenizer_mode="mistral",
+            load_format="mistral",
+            config_format="mistral",
+    ) as mistral_format_model:
+        mistral_format_outputs = mistral_format_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_format_outputs,
+        outputs_1_lst=mistral_format_outputs,
+        name_0="hf",
+        name_1="mistral",
+    )
--- a/tests/models/test_modelopt.py
+++ b/tests/models/test_modelopt.py
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
--- a/tests/models/test_pixtral.py
+++ b/tests/models/test_pixtral.py
--- a/tests/models/test_qwen.py
+++ b/tests/models/test_qwen.py
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
 import pytest
+import transformers

 from vllm.model_executor.models import _MODELS, ModelRegistry


 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
+    if (model_cls == "Qwen2VLForConditionalGeneration"
+            and transformers.__version__ < "4.45"):
+        pytest.skip("Waiting for next transformers release")
+
    # Ensure all model classes can be imported successfully
    ModelRegistry.resolve_model_cls([model_cls])
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -57,7 +57,7 @@ def test_multi_step_llm(
                           GPU -> CPU output transfer
      num_prompts: number of example prompts under test
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
-                    completions endpoint; `None` -> no logprobs
+                    completions endpoint; `None` -> 1 logprob returned.
    """

    prompts = example_prompts

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py