Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

af7f4372 · zhuwenwen · 5e19cdef · 09c77926 · af7f4372 · af7f4372
Commit af7f4372 authored Sep 03, 2024 by zhuwenwen
20 changed files
--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_variation.py
@@ -20,12 +20,12 @@ from vllm.triton_utils.libentry import LibEntry
 from .utils import (generate_data, generate_data_for_expand_nslices,
                    ref_torch_groupgemm)
-HIDDEN_SIZES = [3424, 4096, 4097]
+HIDDEN_SIZES = [4097]
 BATCHES = [1, 4, 16, 32]
-NUM_LORA = [1, 4, 8, 16, 32, 64, 128]
+NUM_LORA = [1, 8, 32, 128]
 DTYPES = [torch.float16, torch.bfloat16]
-MAX_RANKS = [1, 4, 8, 16, 32, 64, 128]
+MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
 SCALES = [0.5]
 SEED = [0]
 CUDA_DEVICES = [f"cuda:{0}"]
@@ -321,22 +321,3 @@ def test_punica_expand_nslices(
        slice_offset += hidden_size
    assert_close(our_outputs, ref_outputs)
-if __name__ == "__main__":
-    from itertools import product
-    lst = list(
-        product(
-            BATCHES,
-            NUM_LORA,
-            MAX_RANKS,
-            [1.0],
-            [torch.float16],
-            ["expand"],
-            SEED,
-            CUDA_DEVICES,
-        ))
-    for ele in lst:
-        test_punica_bgmv(*ele)
-        print(f"{ele},pass")
--- a/tests/models/test_bart.py
+++ b/tests/models/test_bart.py
+"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
+Run `pytest tests/models/test_bart.py`.
+"""
+from typing import List, Optional, Tuple
+from vllm.utils import is_cpu
+if not is_cpu():
+    # CPU backend is not currently supported with encoder/decoder models
+    # skip test definitions entirely to avoid importing GPU kernel libs
+    # (xFormers, etc.)
+    import pytest
+    from transformers import AutoModelForSeq2SeqLM
+    from vllm.sequence import SampleLogprobs
+    from ..conftest import DecoderPromptType
+    from .utils import check_logprobs_close
+    MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
+    def vllm_to_hf_output(
+        vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
+        decoder_prompt_type: DecoderPromptType,
+    ):
+        """Sanitize vllm output to be comparable with hf output."""
+        output_ids, output_str, out_logprobs = vllm_output
+        hf_output_str = output_str + "</s>"
+        if decoder_prompt_type == DecoderPromptType.NONE:
+            hf_output_str = "<s>" + hf_output_str
+        return output_ids, hf_output_str, out_logprobs
+    @pytest.mark.parametrize("model", MODELS)
+    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
+    @pytest.mark.parametrize("max_tokens", [64])
+    @pytest.mark.parametrize("num_logprobs", [5])
+    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+    def test_models(
+        hf_runner,
+        vllm_runner,
+        example_encoder_decoder_prompts,
+        model: str,
+        dtype: str,
+        max_tokens: int,
+        num_logprobs: int,
+        decoder_prompt_type: DecoderPromptType,
+    ) -> None:
+        '''
+        Test the vLLM BART model for a variety of encoder/decoder input prompts,
+        by validating it against HuggingFace (HF) BART.
+        Arguments:
+        * hf_runner: HuggingFace (HF) test model runner
+        * vllm_runner: vLLM test model runner
+        * example_encoder_decoder_prompts: test fixture which provides a 
+                                           dictionary of dummy prompts
+        * model: the HF ID of the specific BART variant under test
+        * dtype: the tensor datatype to employ
+        * max_tokens
+        * num_logprobs
+        * decoder_prompt_type: key into the example_encoder_decoder_prompts
+                               dictionary; selects specific encoder/decoder
+                               prompt scenarios to test
+        A note on using HF BART as a baseline for validating vLLM BART,
+        specifically when the decoder prompt is None. 
+        The HF GenerationMixin's default behavior is to force the first
+        decoded token to be <BOS> if the prompt does not already contain
+        <BOS> (this is accomplished using a logit
+        processor setting.)
+        So when we use HF BART as our baseline for comparison, note that
+        when the user provides a request with a None decoder prompt
+        (i.e. a singleton encoder prompt, or else an explicit encoder/
+        decoder prompt with the decoder sub-prompt set to None), HF and
+        vLLM handle this in different ways:
+        * HF will (1) tokenize the None prompt as an empty token-list, 
+          (2) append <decoder-start-token> to the beginning, yielding
+          [<decoder-start-token>], (3) pass this token list to the model, and
+          then (4) after computing logits during prefill, override the model
+          logits & force <BOS> to be the first generated token.
+        * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
+          start-token to the beginning, yielding [<decoder-start-token><BOS>],
+          (3) pass these tokens to the model & proceed with generation.
+        The net effect is that compared to vLLM, the list of HF *decoded* tokens
+        will contain one more initial <BOS> than the vLLM generated tokens,
+        because vLLM's <BOS> token is injected into the prompt rather than into
+        the generated output. This is in spite of the fact that overall, the
+        complete sequences (prompt + decoded tokens) produced by vLLM will match
+        HF.
+        So when we use HF decoded token output to validate vLLM's decoded token
+        output, the testing process must account for the difference in decoded
+        token sequences between vLLM and HF specifically in the
+        decoder-prompt-is-None case. 
+        One option is to disable the logit processor feature that forces the
+        <BOS> token to be decoded (forced_bos_token_id = None), eliminating
+        the problem entirely. However this is not "normal" BART usage.
+        The other option is - only in the decoder-prompt-is-None case - to
+        discard the first decoded token from the HF output before comparing it
+        to vLLM.
+        To that end, when testing the scenario where the decoder prompt is None
+        (and only in that one scenario), this test skips the first HF decoded
+        token during the process of validating the vLLM decoded output.
+        '''
+        test_case_prompts = example_encoder_decoder_prompts[
+            decoder_prompt_type]
+        # Configuration settings for HF baseline
+        hf_kwargs = {
+            "top_k": None,
+            "num_beams": 1,
+            "repetition_penalty": 1.0,
+            "top_p": 1.0,
+            "length_penalty": 1.0,
+            "early_stopping": False,
+            "no_repeat_ngram_size": None,
+            "min_length": 0
+        }
+        with hf_runner(model, dtype=dtype,
+                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+            hf_outputs = (
+                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+                    test_case_prompts,
+                    max_tokens,
+                    num_logprobs,
+                    **hf_kwargs,
+                ))
+        # Note: currently encoder/decoder models are only compatible with
+        # enforce_eager=True. Normally this is not a problem because
+        # for encoder/decoder models vLLM will
+        # default to enforce_eager=True if enforce_eager
+        # is left unspecified. However, the
+        # VllmRunner test fixture (which wraps around the LLM class) defaults to
+        # enforce_eager=False (a behavior which a number of already-exisitng
+        # decoder-only unit tests expect), so when testing an encoder/decoder
+        # model we must explicitly specify enforce_eager=True in the VllmRunner
+        # constructor.
+        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+                test_case_prompts, max_tokens, num_logprobs)
+        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
+                          else 0)
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, decoder_prompt_type)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+            num_outputs_0_skip_tokens=hf_skip_tokens,
+        )
--- a/tests/models/test_blip2.py
+++ b/tests/models/test_blip2.py
 from typing import List, Optional, Tuple
 import pytest
-from transformers import AutoTokenizer
+from transformers import AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -59,7 +59,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -80,7 +80,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
            for prompts, images in inputs_per_image
        ]
-    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,

--- a/tests/models/test_chameleon.py
+++ b/tests/models/test_chameleon.py
-import re
 from typing import List, Optional, Type
 import pytest
+from transformers import AutoModelForVision2Seq, BatchEncoding
 from vllm.multimodal.utils import rescale_image_size
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from ..conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_outputs_equal
 pytestmark = pytest.mark.vlm
@@ -19,9 +21,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 models = ["facebook/chameleon-7b"]
-#TODO (ywang96): Add correctness test when chameleon is
-# available on transformers.
 def run_test(
+    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
@@ -29,13 +30,20 @@ def run_test(
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
+    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
-    """Test if the model can generate text given 
+    """Inference result should be the same between hf and vllm.
-    a batch of images and prompts.
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
    """
+    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
    images = [asset.pil_image for asset in image_assets]
    inputs_per_image = [(
@@ -50,35 +58,49 @@ def run_test(
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True) as vllm_model:
-        for prompts, images in inputs_per_image:
+        vllm_outputs_per_image = [
-            vllm_outputs = vllm_model.generate_greedy(prompts,
+            vllm_model.generate_greedy_logprobs(prompts,
-                                                      max_tokens,
+                                                max_tokens,
-                                                      images=images)
+                                                num_logprobs=num_logprobs,
-            for i in range(len(vllm_outputs)):
+                                                images=images)
+            for prompts, images in inputs_per_image
-                # format prompt back to original
+        ]
-                replacements = {
-                    "<racm3:break>": "",
+    def process(hf_inputs: BatchEncoding):
-                    "<eoss>": "",
+        hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
-                    "<reserved08706>": ""
+            .to(torch_dtype)  # type: ignore
-                }
+        return hf_inputs
-                pattern = '|'.join(replacements.keys())
-                vllm_result = re.sub(
+    with hf_runner(model,
-                    pattern,
+                   dtype=dtype,
-                    lambda match: replacements[match.group(0)],  #noqa B023
+                   postprocess_inputs=process,
-                    vllm_outputs[i][1])
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
-                vllm_result = vllm_result.replace("<image>", "", 1023)
+        hf_outputs_per_image = [
-                assert vllm_result[:len(prompts[i])] == prompts[i]
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
-                # assert at least 10 new characters are generated
+                                                    num_logprobs=num_logprobs,
-                # (to take stop token into account)
+                                                    images=images)
-                assert len(vllm_outputs[i][1]) - len(prompts[i]) > 10
+            for prompts, images in inputs_per_image
+        ]
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        # HF Logprobs include image tokens, unlike vLLM, so we don't directly
+        # compare them
+        check_outputs_equal(
+            outputs_0_lst=[outputs[:2] for outputs in hf_outputs],
+            outputs_1_lst=[outputs[:2] for outputs in vllm_outputs],
+            name_0="hf",
+            name_1="vllm",
+        )
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "size_factors",
    [
+        # No image
+        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
@@ -88,15 +110,18 @@ def run_test(
    ],
 )
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("max_tokens", [8])
-def test_models(vllm_runner, image_assets, model, size_factors, dtype: str,
+@pytest.mark.parametrize("num_logprobs", [5])
-                max_tokens: int) -> None:
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype, max_tokens, num_logprobs) -> None:
    run_test(
+        hf_runner,
        vllm_runner,
        image_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
--- a/tests/models/test_fuyu.py
+++ b/tests/models/test_fuyu.py
@@ -49,7 +49,7 @@ def run_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """

--- a/tests/models/test_gguf.py
+++ b/tests/models/test_gguf.py
+"""
+Tests gguf models against unquantized models generations
+Note: To pass the test, quantization higher than Q4 should be used
+"""
+import os
+import pytest
+from huggingface_hub import hf_hub_download
+from transformers import AutoTokenizer
+from tests.quantization.utils import is_quant_method_supported
+from .utils import check_logprobs_close
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+MAX_MODEL_LEN = 1024
+# FIXME: Move this to confest
+MODELS = [
+    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+     hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
+    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+     hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
+                     filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
+    ("Qwen/Qwen2-1.5B-Instruct",
+     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
+                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
+    ("Qwen/Qwen2-1.5B-Instruct",
+     hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
+                     filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
+]
+@pytest.mark.skipif(not is_quant_method_supported("gguf"),
+                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [1, 2])
+def test_models(
+    num_gpus_available,
+    vllm_runner,
+    example_prompts,
+    model,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+    original_model, gguf_model = model
+    tokenizer = AutoTokenizer.from_pretrained(original_model)
+    messages = [[{
+        'role': 'user',
+        'content': prompt
+    }] for prompt in example_prompts]
+    example_prompts = tokenizer.apply_chat_template(messages,
+                                                    tokenize=False,
+                                                    add_generation_prompt=True)
+    # Run unquantized model.
+    with vllm_runner(model_name=original_model,
+                     dtype=dtype,
+                     max_model_len=MAX_MODEL_LEN,
+                     tensor_parallel_size=tp_size) as original_model:
+        original_outputs = original_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
+    # Run gguf model.
+    with vllm_runner(model_name=gguf_model,
+                     dtype=dtype,
+                     max_model_len=MAX_MODEL_LEN,
+                     tensor_parallel_size=tp_size) as gguf_model:
+        gguf_outputs = gguf_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
+    check_logprobs_close(
+        outputs_0_lst=original_outputs,
+        outputs_1_lst=gguf_outputs,
+        name_0="original",
+        name_1="gguf",
+    )
--- a/tests/models/test_intern_vit.py
+++ b/tests/models/test_intern_vit.py
+from typing import Optional
+import pytest
+import torch
+import torch.nn as nn
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModel, CLIPImageProcessor
+from vllm.model_executor.models.intern_vit import InternVisionModel
+from ..conftest import _ImageAssets, cleanup
+pytestmark = pytest.mark.vlm
+# we use snapshot_download to prevent conflicts between
+# dynamic_module and trust_remote_code for hf_runner
+DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
+models = [
+    snapshot_download("OpenGVLab/InternViT-300M-448px",
+                      allow_patterns=DOWNLOAD_PATTERN),
+    snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5",
+                      allow_patterns=DOWNLOAD_PATTERN),
+]
+def run_intern_vit_test(
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    dtype: str,
+    distributed_executor_backend: Optional[str] = None,
+):
+    img_processor = CLIPImageProcessor.from_pretrained(model)
+    images = [asset.pil_image for asset in image_assets]
+    pixel_values = [
+        img_processor(images, return_tensors='pt').pixel_values.to(dtype)
+        for images in images
+    ]
+    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
+    if not getattr(config, "norm_type", None):
+        config.norm_type = "rms_norm"
+    hf_model = AutoModel.from_pretrained(model,
+                                         torch_dtype=dtype,
+                                         trust_remote_code=True).to("cuda")
+    hf_outputs_per_image = [
+        hf_model(pixel_value.to("cuda")).last_hidden_state
+        for pixel_value in pixel_values
+    ]
+    vllm_model = InternVisionModel(config)
+    vllm_model.load_weights(hf_model.state_dict().items())
+    del hf_model
+    cleanup()
+    vllm_model = vllm_model.to("cuda", dtype)
+    vllm_outputs_per_image = [
+        vllm_model(pixel_values=pixel_value.to("cuda"))
+        for pixel_value in pixel_values
+    ]
+    del vllm_model
+    cleanup()
+    cos_similar = nn.CosineSimilarity(dim=-1)
+    for vllm_output, hf_output in zip(vllm_outputs_per_image,
+                                      hf_outputs_per_image):
+        assert cos_similar(vllm_output, hf_output).mean() > 0.99
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [torch.half])
+@torch.inference_mode()
+def test_models(dist_init, image_assets, model, dtype: str) -> None:
+    run_intern_vit_test(
+        image_assets,
+        model,
+        dtype=dtype,
+    )
--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
 import types
-from typing import List, Optional, Type
+from typing import List, Optional, Tuple, Type
 import pytest
 import torch
 from huggingface_hub import snapshot_download
 from PIL.Image import Image
+from transformers import AutoConfig
 from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END,
                                                 IMG_START,
@@ -26,10 +27,15 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
+DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"]
 models = [
-    snapshot_download("OpenGVLab/InternVL2-1B"),
+    snapshot_download("OpenGVLab/InternVL2-1B",
-    snapshot_download("OpenGVLab/InternVL2-2B"),
+                      allow_patterns=DOWNLOAD_PATTERN),
-    # snapshot_download("OpenGVLab/InternVL2-4B"),  # broken
+    snapshot_download("OpenGVLab/InternVL2-2B",
+                      allow_patterns=DOWNLOAD_PATTERN),
+    # Broken due to outdated implementation of Phi-3
+    # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
+    # snapshot_download("OpenGVLab/InternVL2-4B"),
 ]
@@ -41,8 +47,17 @@ class InternVLProcessor:
        self.tokenizer = hf_runner.tokenizer
        self.dtype = hf_runner.model.dtype
+        self.config = AutoConfig.from_pretrained(hf_runner.model_name)
+        self.vision_config = self.config.vision_config
+        self.use_thumbnail = self.config.use_thumbnail
+        self.min_num = self.config.min_dynamic_patch
+        self.max_num = self.config.max_dynamic_patch
+        self.image_size = self.vision_config.image_size
    def __call__(self, text: str, images: Image, **kwargs):
-        pixel_values = image_to_pixel_values(images).to(self.dtype)
+        pixel_values = image_to_pixel_values(images, self.image_size,
+                                             self.min_num, self.max_num,
+                                             self.use_thumbnail).to(self.dtype)
        num_patches_list = [pixel_values.shape[0]]
        for num_patches in num_patches_list:
            context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
@@ -102,7 +117,7 @@ def run_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -163,6 +178,74 @@ def run_test(
        )
+def run_awq_test(
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    models: Tuple[str, str],
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    source_model, quant_model = models
+    images = [asset.pil_image for asset in image_assets]
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(source_model,
+                     max_model_len=4096,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        source_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in inputs_per_image
+        ]
+    with vllm_runner(quant_model,
+                     quantization="awq",
+                     max_model_len=4096,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        quant_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in inputs_per_image
+        ]
+    for source_outputs, quant_outputs in zip(source_outputs_per_image,
+                                             quant_outputs_per_image):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=source_outputs,
+            outputs_1_lst=quant_outputs,
+            name_0="source",
+            name_1="awq",
+        )
 target_dtype = "half"
 if is_cpu():
    target_dtype = "bfloat16"
@@ -199,3 +282,36 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
+@pytest.mark.parametrize(
+    "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+@torch.inference_mode()
+def test_awq_models(vllm_runner, image_assets, models, size_factors,
+                    dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    run_awq_test(
+        vllm_runner,
+        image_assets,
+        models,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -6,9 +6,12 @@ from vllm.worker.model_runner import _get_graph_batch_size
 MODELS = ["ai21labs/Jamba-tiny-random"]
+# Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl
+# TODO: Fix this with trained model
+@pytest.mark.skip()
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [20])
+@pytest.mark.parametrize("max_tokens", [10])
 def test_models(
    hf_runner,
    vllm_runner,
@@ -17,8 +20,6 @@ def test_models(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
@@ -36,8 +37,8 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [20])
+@pytest.mark.parametrize("max_tokens", [5])
 def test_batching(
    vllm_runner,
    example_prompts,

--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
 from typing import List, Optional, Tuple, Type
 import pytest
-from transformers import AutoTokenizer
+from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
+                          BatchEncoding)
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from .utils import check_logprobs_close
@@ -18,9 +20,11 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "USER: <image>\nWhat is the season?\nASSISTANT:",
 })
-IMAGE_TOKEN_ID = 32000
+models = [
+    "llava-hf/llava-1.5-7b-hf",
-models = ["llava-hf/llava-1.5-7b-hf"]
+    # TODO: Get this model to produce meaningful output in vLLM
+    # "TIGER-Lab/Mantis-8B-siglip-llama3",
+]
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -29,12 +33,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output
+    config = AutoConfig.from_pretrained(model)
+    image_token_id = config.image_token_index
    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
-        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]
    assert output_str[0] == " "
@@ -63,10 +70,21 @@ def run_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
+    # NOTE: For local use; this isn't tested in CI yet (see TODO above)
+    if model.startswith("TIGER-Lab/Mantis"):
+        from mantis.models.mllava import MLlavaProcessor
+        torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
+        mantis_processor = MLlavaProcessor.from_pretrained(
+            model, torch_dtype=torch_dtype)
+        assert isinstance(mantis_processor, MLlavaProcessor)
+    else:
+        mantis_processor = None
    images = [asset.pil_image for asset in image_assets]
    inputs_per_image = [(
@@ -93,7 +111,21 @@ def run_test(
            for prompts, images in inputs_per_image
        ]
-    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
+    if mantis_processor is not None:
+        def process(hf_inputs: BatchEncoding):
+            hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \
+                .to(torch_dtype)  # type: ignore
+            return hf_inputs
+    else:
+        def process(hf_inputs: BatchEncoding):
+            return hf_inputs
+    with hf_runner(model,
+                   dtype=dtype,
+                   postprocess_inputs=process,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,

--- a/tests/models/test_llava_image_embeds.py
+++ b/tests/models/test_llava_image_embeds.py
+from typing import List, Optional, Tuple, Type
+import pytest
+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
+from vllm.sequence import SampleLogprobs
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_logprobs_close
+pytestmark = pytest.mark.vlm
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
+    "cherry_blossom":
+    "USER: <image>\nWhat is the season?\nASSISTANT:",
+})
+models = [
+    "llava-hf/llava-1.5-7b-hf",
+]
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+    config = AutoConfig.from_pretrained(model)
+    image_token_id = config.image_token_index
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
+    ]
+    assert output_str[0] == " "
+    hf_output_str = output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+    return hf_output_ids, hf_output_str, out_logprobs
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model: str,
+    *,
+    size_factors: List[float],
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm.
+    All the image fixtures for the test is under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    # vLLM to load from image embeddings
+    vllm_images = [asset.image_embeds for asset in image_assets]
+    # transformers to load from PIL images
+    hf_images = [asset.pil_image for asset in image_assets]
+    vllm_inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [image for _ in size_factors],
+    ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)]
+    hf_inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [image for _ in size_factors],
+    ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images)
+            for prompts, images in vllm_inputs_per_image
+        ]
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=images)
+            for prompts, images in hf_inputs_per_image
+        ]
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, model)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model,
+        size_factors=size_factors,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
 from typing import List, Optional, Tuple, Type, overload
 import pytest
-from transformers import AutoTokenizer
+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -23,8 +23,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
 })
-IMAGE_TOKEN_ID = 32000
 models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]
@@ -34,12 +32,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output
+    config = AutoConfig.from_pretrained(model)
+    image_token_id = config.image_token_index
    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
-        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]
    assert output_str[0] == " "
@@ -128,7 +129,8 @@ def run_test(
            for prompts, images in inputs_per_image
        ]
-    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
@@ -176,7 +178,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """

--- a/tests/models/test_minicpmv.py
+++ b/tests/models/test_minicpmv.py
-from collections import UserDict
 from typing import List, Optional, Tuple, Type
 import pytest
 import torch
 import torch.types
-from transformers import BatchFeature
+from transformers import BatchEncoding
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -14,18 +13,6 @@ from .utils import check_logprobs_close
 pytestmark = pytest.mark.vlm
-class NestedInputs(UserDict):
-    def __init__(self, model_inputs: BatchFeature):
-        super().__init__({"model_inputs": model_inputs})
-        self.model_inputs = model_inputs
-    def to(self, device: torch.types.Device):
-        return NestedInputs(self.model_inputs.to(device))
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -41,6 +28,10 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
 models = ["openbmb/MiniCPM-Llama3-V-2_5"]
+def _wrap_inputs(hf_inputs: BatchEncoding) -> BatchEncoding:
+    return BatchEncoding({"model_inputs": hf_inputs})
 def trunc_hf_output(hf_output: Tuple[List[int], str,
                                     Optional[SampleLogprobs]]):
    output_ids, output_str, out_logprobs = hf_output
@@ -70,7 +61,7 @@ def run_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -105,11 +96,8 @@ def run_test(
            for prompts, images in inputs_per_image
        ]
-    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
+    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
-        hf_processor = hf_model.processor
+    with hf_model, torch.no_grad():
-        hf_model.processor = lambda **kw: NestedInputs(
-            hf_processor(**kw)  # type: ignore
-        )
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
@@ -188,7 +176,7 @@ def run_multi_image_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -209,6 +197,7 @@ def run_multi_image_test(
    with vllm_runner(model,
                     max_model_len=4096,
                     max_num_seqs=1,
+                     limit_mm_per_prompt={"image": len(images)},
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
@@ -224,11 +213,8 @@ def run_multi_image_test(
            for prompts, images in inputs_per_case
        ]
-    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
+    hf_model = hf_runner(model, dtype=dtype, postprocess_inputs=_wrap_inputs)
-        hf_processor = hf_model.processor
+    with hf_model, torch.no_grad():
-        hf_model.processor = lambda **kw: NestedInputs(
-            hf_processor(**kw)  # type: ignore
-        )
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
-import torch
+import os
-from vllm import LLM, ModelRegistry, SamplingParams
+import pytest
-from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm import LLM, SamplingParams
-class MyOPTForCausalLM(OPTForCausalLM):
+from ..utils import fork_new_process_for_each_test
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
-        logits.zero_()
-        logits[:, 0] += 1.0
-        return logits
+@fork_new_process_for_each_test
+def test_plugin(dummy_opt_path):
+    os.environ["VLLM_PLUGINS"] = ""
+    with pytest.raises(Exception) as excinfo:
+        LLM(model=dummy_opt_path, load_format="dummy")
+    assert "are not supported for now" in str(excinfo.value)
-def test_oot_registration():
-    # register our dummy model
+@fork_new_process_for_each_test
-    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
+def test_oot_registration(dummy_opt_path):
+    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
    prompts = ["Hello, my name is", "The text does not matter"]
    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model="facebook/opt-125m")
+    llm = LLM(model=dummy_opt_path, load_format="dummy")
    first_token = llm.get_tokenizer().decode(0)
    outputs = llm.generate(prompts, sampling_params)

--- a/tests/models/test_paligemma.py
+++ b/tests/models/test_paligemma.py
@@ -2,7 +2,7 @@ import os
 from typing import List, Optional, Tuple, Type
 import pytest
-from transformers import AutoTokenizer
+from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -20,8 +20,6 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "What is in the picture?",
 })
-IMAGE_TOKEN_ID = 257152
 models = ["google/paligemma-3b-mix-224"]
 # ROCm Triton FA can run into compilation issues with these models due to,
@@ -37,12 +35,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output
+    config = AutoConfig.from_pretrained(model)
+    image_token_id = config.image_token_index
    tokenizer = AutoTokenizer.from_pretrained(model)
    eos_token_id = tokenizer.eos_token_id
    hf_output_ids = [
        token_id for idx, token_id in enumerate(output_ids)
-        if token_id != IMAGE_TOKEN_ID or output_ids[idx - 1] != IMAGE_TOKEN_ID
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]
    hf_output_str = output_str
@@ -71,7 +72,7 @@ def run_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -101,7 +102,8 @@ def run_test(
            for prompts, images in inputs_per_image
        ]
-    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,

--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -21,7 +21,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
 })
-models = ["microsoft/Phi-3-vision-128k-instruct"]
+models = ["microsoft/Phi-3.5-vision-instruct"]
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
@@ -73,7 +73,7 @@ def run_test(
    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects 
-    and corresponding vision language config as input.
+    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -81,7 +81,10 @@ def run_test(
    inputs_per_image = [(
        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
+        [
+            rescale_image_size(image, factor, transpose=idx)
+            for idx, factor in enumerate(size_factors)
+        ],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
    # NOTE: take care of the order. run vLLM first, and then run HF.

--- a/tests/models/test_qwen.py
+++ b/tests/models/test_qwen.py
+from typing import Type
+import pytest
+from ..conftest import HfRunner, VllmRunner
+from .utils import check_logprobs_close
+models = ["qwen/qwen-vl"]
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("model", models)
+def test_text_only_qwen_model(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    example_prompts,
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+):
+    # This test checks language inputs only, since the visual component
+    # for qwen-vl is still unsupported in VLLM. In the near-future, the
+    # implementation and this test will be extended to consider
+    # visual inputs as well.
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts,
+            max_tokens,
+            num_logprobs=num_logprobs,
+        )
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens,
+            num_logprobs=num_logprobs,
+        )
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -6,4 +6,4 @@ from vllm.model_executor.models import _MODELS, ModelRegistry
 @pytest.mark.parametrize("model_cls", _MODELS)
 def test_registry_imports(model_cls):
    # Ensure all model classes can be imported successfully
-    ModelRegistry.load_model_cls(model_cls)
+    ModelRegistry.resolve_model_cls([model_cls])
--- a/tests/models/test_ultravox.py
+++ b/tests/models/test_ultravox.py
+from typing import List, Optional, Tuple, Type
+import librosa
+import numpy as np
+import pytest
+from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from vllm.assets.audio import AudioAsset
+from vllm.sequence import SampleLogprobs
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from ..conftest import HfRunner, VllmRunner
+from .utils import check_logprobs_close
+pytestmark = pytest.mark.vlm
+MODEL_NAME = "fixie-ai/ultravox-v0_3"
+AudioTuple = Tuple[np.ndarray, int]
+@pytest.fixture(scope="session")
+def audio_and_sample_rate():
+    return AudioAsset("mary_had_lamb").audio_and_sample_rate
+@pytest.fixture
+def prompts_and_audios(audio_and_sample_rate):
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    vllm_placeholder = "<|reserved_special_token_0|>"
+    hf_placeholder = "<|audio|>"
+    question = "What's in the audio?"
+    vllm_prompt = tokenizer.apply_chat_template(
+        [{
+            'role': 'user',
+            'content': f"{vllm_placeholder}\n{question}"
+        }],
+        tokenize=False,
+        add_generation_prompt=True)
+    hf_prompt = tokenizer.apply_chat_template(
+        [{
+            'role': 'user',
+            'content': f"{hf_placeholder}\n{question}"
+        }],
+        tokenize=False,
+        add_generation_prompt=True)
+    return [(vllm_prompt, hf_prompt, audio_and_sample_rate)]
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      model: str):
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    eos_token_id = tokenizer.eos_token_id
+    hf_output_ids = output_ids[:]
+    hf_output_str = output_str
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
+    return hf_output_ids, hf_output_str, out_logprobs
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    prompts_and_audios: List[Tuple[str, str, AudioTuple]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Inference result should be the same between hf and vllm."""
+    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs_per_audio = [
+            vllm_model.generate_greedy_logprobs([vllm_prompt],
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                audios=[audio])
+            for vllm_prompt, _, audio in prompts_and_audios
+        ]
+    def process(hf_inputs: BatchEncoding):
+        hf_inputs["audio_values"] = hf_inputs["audio_values"] \
+            .to(torch_dtype)  # type: ignore
+        return hf_inputs
+    with hf_runner(model,
+                   dtype=dtype,
+                   postprocess_inputs=process,
+                   auto_cls=AutoModel) as hf_model:
+        hf_outputs_per_audio = [
+            hf_model.generate_greedy_logprobs_limit(
+                [hf_prompt],
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=[(librosa.resample(audio[0],
+                                          orig_sr=audio[1],
+                                          target_sr=16000), 16000)])
+            for _, hf_prompt, audio in prompts_and_audios
+        ]
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio,
+                                        vllm_outputs_per_audio):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, model)
+                for vllm_output in vllm_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+        )
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, prompts_and_audios, dtype: str,
+                max_tokens: int, num_logprobs: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        prompts_and_audios,
+        MODEL_NAME,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -45,11 +45,27 @@ def check_logprobs_close(
    outputs_1_lst: Sequence[TokensTextLogprobs],
    name_0: str,
    name_1: str,
+    num_outputs_0_skip_tokens: int = 0,
    warn_on_mismatch: bool = True,
 ):
    """
    Compare the logprobs of two sequences generated by different models,
    which should be similar but not necessarily equal.
+    Arguments:
+    * outputs_0_lst: First sequence to compare
+    * outputs_0_lst: Second sequence to compare
+    * name_0: sequence #0 name
+    * name_1: sequence #1 name
+    * num_outputs_0_skip_tokens: If > 0, specifies the number of initial
+                                 sequence #0 tokens & logprobs to discard
+                                 before comparison, i.e. all
+                                 of sequence #1 will be compared to
+                                 sequence #0 beginning at index
+                                 num_outputs_0_skip_tokens
+    * warn_on_mismatch: Issue a warning if there is token-wise or text-wise
+                        mismatch between the two sequences
    """
    assert len(outputs_0_lst) == len(outputs_1_lst)
@@ -65,6 +81,15 @@ def check_logprobs_close(
        if logprobs_1 is None:
            logprobs_1 = [None] * len(output_ids_1)
+        # Skip specified number of initial sequence #0 tokens
+        # & logprobs, leaving output text as-is for simplicity
+        # (text mismatches may generate warnings but do not
+        # cause the test to fail.)
+        if num_outputs_0_skip_tokens < 0:
+            raise ValueError("num_outputs_0_skip_tokens must be non-negative")
+        output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:]
+        logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:]
        # Loop through generated tokens.
        for idx, (output_id_0,
                  output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):