Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

6d2051cc · zhuwenwen · 2c7f740a · a2c71c54 · 6d2051cc · 6d2051cc
Commit 6d2051cc authored Oct 21, 2024 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/vision_language/test_qwen.py
+++ b/tests/models/decoder_only/vision_language/test_qwen.py
@@ -5,7 +5,7 @@ import pytest
 import torch
 from PIL.Image import Image
-from vllm.inputs import InputContext, LLMInputs
+from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
@@ -71,12 +71,12 @@ def test_input_processor_valid_mm_data(input_processor_for_qwen,
    """Happy cases for image inputs to Qwen's multimodal input processor."""
    prompt = "".join(
        [f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
-    inputs = LLMInputs(
+    inputs = token_inputs(
        prompt=prompt,
        # When processing multimodal data for a multimodal model, the qwen
        # input processor will overwrite the provided prompt_token_ids with
        # the image prompts
-        prompt_token_ids=None,
+        prompt_token_ids=[],
        multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
    )
    proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
@@ -134,9 +134,9 @@ def test_input_processor_invalid_mm_data(input_processor_for_qwen,
                                     trust_remote_code=True)
    prompt = "Picture 1: <img></img>\n"
    prompt_token_ids = tokenizer.encode(prompt)
-    inputs = LLMInputs(prompt=prompt,
+    inputs = token_inputs(prompt=prompt,
-                       prompt_token_ids=prompt_token_ids,
+                          prompt_token_ids=prompt_token_ids,
-                       multi_modal_data=mm_data)
+                          multi_modal_data=mm_data)
    # Should fail since we have too many or too few dimensions for embeddings
    with pytest.raises(ValueError):
        input_processor_for_qwen(qwen_vl_context, inputs)
@@ -221,7 +221,7 @@ def run_test(
    # Qwen encodes each image into a fixed content size of 256
    with vllm_runner(model,
                     max_model_len=1024,
-                     max_num_seqs=1,
+                     max_num_seqs=2,
                     dtype=dtype,
                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,

--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
-"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
+"""Compare the embedding outputs of HF and vLLM models.
-Run `pytest tests/models/test_llama_embedding.py`.
+Run `pytest tests/models/embedding/language/test_embedding.py`.
 """
 import pytest
-import torch
-import torch.nn.functional as F
+from ..utils import check_embeddings_close
 MODELS = [
    "intfloat/e5-mistral-7b-instruct",
+    "BAAI/bge-multilingual-gemma2",
 ]
-def compare_embeddings(embeddings1, embeddings2):
-    similarities = [
-        F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0)
-        for e1, e2 in zip(embeddings1, embeddings2)
-    ]
-    return similarities
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_models(
@@ -28,15 +21,25 @@ def test_models(
    model: str,
    dtype: str,
 ) -> None:
-    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
+    # The example_prompts end with "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers strips the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids differ between hf_model and vllm_model,
+    # so we strip the input texts here to keep the test from failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+    with hf_runner(model, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)
-    similarities = compare_embeddings(hf_outputs, vllm_outputs)
+    check_embeddings_close(
-    all_similarities = torch.stack(similarities)
+        embeddings_0_lst=hf_outputs,
-    tolerance = 1e-2
+        embeddings_1_lst=vllm_outputs,
-    assert torch.all((all_similarities <= 1.0 + tolerance)
+        name_0="hf",
-                     & (all_similarities >= 1.0 - tolerance)
+        name_1="vllm",
-                     ), f"Not all values are within {tolerance} of 1.0"
+        tol=1e-2,
+    )
--- a/tests/models/embedding/utils.py
+++ b/tests/models/embedding/utils.py
+from typing import List, Sequence
+import torch
+import torch.nn.functional as F
+def check_embeddings_close(
+    *,
+    embeddings_0_lst: Sequence[List[float]],
+    embeddings_1_lst: Sequence[List[float]],
+    name_0: str,
+    name_1: str,
+    tol: float = 1e-3,
+) -> None:
+    assert len(embeddings_0_lst) == len(embeddings_1_lst)
+    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
+            zip(embeddings_0_lst, embeddings_1_lst)):
+        assert len(embeddings_0) == len(embeddings_1)
+        sim = F.cosine_similarity(torch.tensor(embeddings_0),
+                                  torch.tensor(embeddings_1),
+                                  dim=0)
+        fail_msg = (f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{embeddings_0!r}"
+                    f"\n{name_1}:\t{embeddings_1!r}")
+        assert sim >= 1 - tol, fail_msg
--- a/tests/models/embedding/vision_language/__init__.py
+++ b/tests/models/embedding/vision_language/__init__.py
--- a/tests/models/embedding/vision_language/test_phi3v.py
+++ b/tests/models/embedding/vision_language/test_phi3v.py
+import pytest
+import torch.nn.functional as F
+from ....conftest import IMAGE_ASSETS
+from ..utils import check_embeddings_close
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign",  # noqa: E501
+    "cherry_blossom":
+    "<|image_1|> Represent the given image with the following question: What is in the image",  # noqa: E501
+})
+MODELS = ["TIGER-Lab/VLM2Vec-Full"]
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    # NOTE: The order matters: run vLLM first, and then run HF.
+    # vLLM needs a fresh process in which CUDA has not been initialized.
+    # If we run HF first, CUDA will already be initialized, which
+    # interferes with the multiprocessing backend's fork method (the default).
+    with vllm_runner(model,
+                     max_model_len=4096,
+                     max_num_seqs=2,
+                     dtype=dtype,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+    with hf_runner(model, dtype=dtype) as hf_model:
+        all_inputs = hf_model.get_inputs(example_prompts)
+        all_outputs = []
+        for inputs in all_inputs:
+            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
+            outputs = hf_model.model(
+                **hf_model.wrap_device(inputs,
+                                       device=hf_model.model.device.type),
+                return_dict=True,
+                output_hidden_states=True,
+            )
+            last_hidden_state = outputs.hidden_states[-1][0]
+            reps = last_hidden_state[inputs.attention_mask[0].sum() - 1]
+            pooled_output = F.normalize(reps, p=2, dim=-1)
+            all_outputs.append(pooled_output.tolist())
+        hf_outputs = all_outputs
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/encoder_decoder/language/test_bart.py
+++ b/tests/models/encoder_decoder/language/test_bart.py
@@ -4,220 +4,214 @@ Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
 """
 from typing import List, Optional, Tuple, Type
-from vllm.utils import is_cpu
+import pytest
+from transformers import AutoModelForSeq2SeqLM
-if not is_cpu():
-    # CPU backend is not currently supported with encoder/decoder models
+from vllm.sequence import SampleLogprobs
-    # skip test definitions entirely to avoid importing GPU kernel libs
-    # (xFormers, etc.)
+from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
+                          HfRunner, VllmRunner)
-    import pytest
+from ....utils import multi_gpu_test
-    from transformers import AutoModelForSeq2SeqLM
+from ...utils import check_logprobs_close
-    from vllm.sequence import SampleLogprobs
+MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
-    from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
-                              HfRunner, VllmRunner)
+def vllm_to_hf_output(
-    from ....utils import multi_gpu_test
+    vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
-    from ...utils import check_logprobs_close
+    decoder_prompt_type: DecoderPromptType,
+):
-    MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
+    """Sanitize vllm output to be comparable with hf output."""
+    output_ids, output_str, out_logprobs = vllm_output
-    def vllm_to_hf_output(
-        vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
+    hf_output_str = output_str + "</s>"
-        decoder_prompt_type: DecoderPromptType,
+    if decoder_prompt_type == DecoderPromptType.NONE:
-    ):
+        hf_output_str = "<s>" + hf_output_str
-        """Sanitize vllm output to be comparable with hf output."""
-        output_ids, output_str, out_logprobs = vllm_output
+    return output_ids, hf_output_str, out_logprobs
-        hf_output_str = output_str + "</s>"
-        if decoder_prompt_type == DecoderPromptType.NONE:
+def run_test(
-            hf_output_str = "<s>" + hf_output_str
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
-        return output_ids, hf_output_str, out_logprobs
+    prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+    decoder_prompt_type: DecoderPromptType,
-    def run_test(
+    model: str,
-        hf_runner: Type[HfRunner],
+    *,
-        vllm_runner: Type[VllmRunner],
+    dtype: str,
-        prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
+    max_tokens: int,
-        decoder_prompt_type: DecoderPromptType,
+    num_logprobs: int,
-        model: str,
+    tensor_parallel_size: int,
-        *,
+    distributed_executor_backend: Optional[str] = None,
-        dtype: str,
+) -> None:
-        max_tokens: int,
+    '''
-        num_logprobs: int,
+    Test the vLLM BART model for a variety of encoder/decoder input prompts,
-        tensor_parallel_size: int,
+    by validating it against HuggingFace (HF) BART.
-        distributed_executor_backend: Optional[str] = None,
-    ) -> None:
+    Arguments:
-        '''
-        Test the vLLM BART model for a variety of encoder/decoder input prompts,
+    * hf_runner: HuggingFace (HF) test model runner
-        by validating it against HuggingFace (HF) BART.
+    * vllm_runner: vLLM test model runner
+    * example_encoder_decoder_prompts: test fixture which provides a 
-        Arguments:
+                                       dictionary of dummy prompts
+    * model: the HF ID of the specific BART variant under test
-        * hf_runner: HuggingFace (HF) test model runner
+    * dtype: the tensor datatype to employ
-        * vllm_runner: vLLM test model runner
+    * max_tokens
-        * example_encoder_decoder_prompts: test fixture which provides a 
+    * num_logprobs
-                                           dictionary of dummy prompts
+    * decoder_prompt_type: key into the example_encoder_decoder_prompts
-        * model: the HF ID of the specific BART variant under test
+                           dictionary; selects specific encoder/decoder
-        * dtype: the tensor datatype to employ
+                           prompt scenarios to test
-        * max_tokens
-        * num_logprobs
+    A note on using HF BART as a baseline for validating vLLM BART,
-        * decoder_prompt_type: key into the example_encoder_decoder_prompts
+    specifically when the decoder prompt is None. 
-                               dictionary; selects specific encoder/decoder
-                               prompt scenarios to test
+    The HF GenerationMixin's default behavior is to force the first
+    decoded token to be <BOS> if the prompt does not already contain
-        A note on using HF BART as a baseline for validating vLLM BART,
+    <BOS> (this is accomplished using a logit
-        specifically when the decoder prompt is None. 
+    processor setting.)
-        The HF GenerationMixin's default behavior is to force the first
+    So when we use HF BART as our baseline for comparison, note that
-        decoded token to be <BOS> if the prompt does not already contain
+    when the user provides a request with a None decoder prompt
-        <BOS> (this is accomplished using a logit
+    (i.e. a singleton encoder prompt, or else an explicit encoder/
-        processor setting.)
+    decoder prompt with the decoder sub-prompt set to None), HF and
+    vLLM handle this in different ways:
-        So when we use HF BART as our baseline for comparison, note that
-        when the user provides a request with a None decoder prompt
+    * HF will (1) tokenize the None prompt as an empty token-list, 
-        (i.e. a singleton encoder prompt, or else an explicit encoder/
+      (2) prepend <decoder-start-token>, yielding
-        decoder prompt with the decoder sub-prompt set to None), HF and
+      [<decoder-start-token>], (3) pass this token list to the model, and
-        vLLM handle this in different ways:
+      then (4) after computing logits during prefill, override the model
+      logits & force <BOS> to be the first generated token.
-        * HF will (1) tokenize the None prompt as an empty token-list, 
-          (2) append <decoder-start-token> to the beginning, yielding
+    * vLLM will (1) tokenize the None prompt as [<BOS>], (2) prepend decoder-
-          [<decoder-start-token>], (3) pass this token list to the model, and
+      start-token, yielding [<decoder-start-token><BOS>],
-          then (4) after computing logits during prefill, override the model
+      (3) pass these tokens to the model & proceed with generation.
-          logits & force <BOS> to be the first generated token.
+    The net effect is that compared to vLLM, the list of HF *decoded* tokens
-        * vLLM will (1) tokenize the None prompt as [<BOS>], (2) append decoder-
+    will contain one more initial <BOS> than the vLLM generated tokens,
-          start-token to the beginning, yielding [<decoder-start-token><BOS>],
+    because vLLM's <BOS> token is injected into the prompt rather than into
-          (3) pass these tokens to the model & proceed with generation.
+    the generated output. This is in spite of the fact that overall, the
+    complete sequences (prompt + decoded tokens) produced by vLLM will match
-        The net effect is that compared to vLLM, the list of HF *decoded* tokens
+    HF.
-        will contain one more initial <BOS> than the vLLM generated tokens,
-        because vLLM's <BOS> token is injected into the prompt rather than into
+    So when we use HF decoded token output to validate vLLM's decoded token
-        the generated output. This is in spite of the fact that overall, the
+    output, the testing process must account for the difference in decoded
-        complete sequences (prompt + decoded tokens) produced by vLLM will match
+    token sequences between vLLM and HF specifically in the
-        HF.
+    decoder-prompt-is-None case. 
-        So when we use HF decoded token output to validate vLLM's decoded token
+    One option is to disable the logit processor feature that forces the
-        output, the testing process must account for the difference in decoded
+    <BOS> token to be decoded (forced_bos_token_id = None), eliminating
-        token sequences between vLLM and HF specifically in the
+    the problem entirely. However this is not "normal" BART usage.
-        decoder-prompt-is-None case. 
+    The other option is - only in the decoder-prompt-is-None case - to
-        One option is to disable the logit processor feature that forces the
+    discard the first decoded token from the HF output before comparing it
-        <BOS> token to be decoded (forced_bos_token_id = None), eliminating
+    to vLLM.
-        the problem entirely. However this is not "normal" BART usage.
+    To that end, when testing the scenario where the decoder prompt is None
-        The other option is - only in the decoder-prompt-is-None case - to
+    (and only in that one scenario), this test skips the first HF decoded
-        discard the first decoded token from the HF output before comparing it
+    token during the process of validating the vLLM decoded output.
-        to vLLM.
+    '''
-        To that end, when testing the scenario where the decoder prompt is None
+    # NOTE: take care of the order. run vLLM first, and then run HF.
-        (and only in that one scenario), this test skips the first HF decoded
+    # vLLM needs a fresh new process without cuda initialization.
-        token during the process of validating the vLLM decoded output.
+    # if we run HF first, the cuda initialization will be done and it
-        '''
+    # will hurt multiprocessing backend with fork method (the default).
-        # NOTE: take care of the order. run vLLM first, and then run HF.
+    # Note: currently encoder/decoder models are only compatible with
-        # vLLM needs a fresh new process without cuda initialization.
+    # enforce_eager=True. Normally this is not a problem because
-        # if we run HF first, the cuda initialization will be done and it
+    # for encoder/decoder models vLLM will
-        # will hurt multiprocessing backend with fork method (the default).
+    # default to enforce_eager=True if enforce_eager
+    # is left unspecified. However, the
-        # Note: currently encoder/decoder models are only compatible with
+    # VllmRunner test fixture (which wraps around the LLM class) defaults to
-        # enforce_eager=True. Normally this is not a problem because
+    # enforce_eager=False (a behavior which a number of already-existing
-        # for encoder/decoder models vLLM will
+    # decoder-only unit tests expect), so when testing an encoder/decoder
-        # default to enforce_eager=True if enforce_eager
+    # model we must explicitly specify enforce_eager=True in the VllmRunner
-        # is left unspecified. However, the
+    # constructor.
-        # VllmRunner test fixture (which wraps around the LLM class) defaults to
+    with vllm_runner(model,
-        # enforce_eager=False (a behavior which a number of already-exisitng
+                     dtype=dtype,
-        # decoder-only unit tests expect), so when testing an encoder/decoder
+                     tensor_parallel_size=tensor_parallel_size,
-        # model we must explicitly specify enforce_eager=True in the VllmRunner
+                     distributed_executor_backend=distributed_executor_backend,
-        # constructor.
+                     enforce_eager=True) as vllm_model:
-        with vllm_runner(
+        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
-                model,
+            prompts, max_tokens, num_logprobs)
-                dtype=dtype,
-                tensor_parallel_size=tensor_parallel_size,
+    # Configuration settings for HF baseline
-                distributed_executor_backend=distributed_executor_backend,
+    hf_kwargs = {
-                enforce_eager=True) as vllm_model:
+        "top_k": None,
-            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+        "num_beams": 1,
-                prompts, max_tokens, num_logprobs)
+        "repetition_penalty": 1.0,
+        "top_p": 1.0,
-        # Configuration settings for HF baseline
+        "length_penalty": 1.0,
-        hf_kwargs = {
+        "early_stopping": False,
-            "top_k": None,
+        "no_repeat_ngram_size": None,
-            "num_beams": 1,
+        "min_length": 0
-            "repetition_penalty": 1.0,
+    }
-            "top_p": 1.0,
-            "length_penalty": 1.0,
+    with hf_runner(model, dtype=dtype,
-            "early_stopping": False,
+                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
-            "no_repeat_ngram_size": None,
+        hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
-            "min_length": 0
+            prompts,
-        }
+            max_tokens,
+            num_logprobs,
-        with hf_runner(model, dtype=dtype,
+            **hf_kwargs,
-                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
+        ))
-            hf_outputs = (
-                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+    hf_skip_tokens = (1
-                    prompts,
+                      if decoder_prompt_type == DecoderPromptType.NONE else 0)
-                    max_tokens,
-                    num_logprobs,
+    check_logprobs_close(
-                    **hf_kwargs,
+        outputs_0_lst=hf_outputs,
-                ))
+        outputs_1_lst=[
+            vllm_to_hf_output(vllm_output, decoder_prompt_type)
-        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
+            for vllm_output in vllm_outputs
-                          else 0)
+        ],
+        name_0="hf",
-        check_logprobs_close(
+        name_1="vllm",
-            outputs_0_lst=hf_outputs,
+        num_outputs_0_skip_tokens=hf_skip_tokens,
-            outputs_1_lst=[
+    )
-                vllm_to_hf_output(vllm_output, decoder_prompt_type)
-                for vllm_output in vllm_outputs
-            ],
+@pytest.mark.parametrize("model", MODELS)
-            name_0="hf",
+@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
-            name_1="vllm",
+@pytest.mark.parametrize("max_tokens", [64])
-            num_outputs_0_skip_tokens=hf_skip_tokens,
+@pytest.mark.parametrize("num_logprobs", [5])
-        )
+@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
-    @pytest.mark.parametrize("model", MODELS)
+                dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:
-    @pytest.mark.parametrize("dtype", ["float", "bfloat16"])
-    @pytest.mark.parametrize("max_tokens", [64])
+    run_test(
-    @pytest.mark.parametrize("num_logprobs", [5])
+        hf_runner,
-    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
+        vllm_runner,
-    def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
+        example_encoder_decoder_prompts[decoder_prompt_type],
-                    model, dtype, max_tokens, num_logprobs,
+        decoder_prompt_type,
-                    decoder_prompt_type) -> None:
+        model,
+        dtype=dtype,
-        run_test(
+        max_tokens=max_tokens,
-            hf_runner,
+        num_logprobs=num_logprobs,
-            vllm_runner,
+        tensor_parallel_size=1,
-            example_encoder_decoder_prompts[decoder_prompt_type],
+    )
-            decoder_prompt_type,
-            model,
-            dtype=dtype,
+@multi_gpu_test(num_gpus=2)
-            max_tokens=max_tokens,
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
-            num_logprobs=num_logprobs,
+@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
-            tensor_parallel_size=1,
+@pytest.mark.parametrize("dtype", ["float"])
-        )
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
-    @multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
-    @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+def test_models_distributed(hf_runner, vllm_runner,
-    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+                            example_encoder_decoder_prompts,
-    @pytest.mark.parametrize("dtype", ["float"])
+                            distributed_executor_backend, model, dtype,
-    @pytest.mark.parametrize("max_tokens", [64])
+                            max_tokens, num_logprobs,
-    @pytest.mark.parametrize("num_logprobs", [5])
+                            decoder_prompt_type) -> None:
-    @pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
+    run_test(
-    def test_models_distributed(hf_runner, vllm_runner,
+        hf_runner,
-                                example_encoder_decoder_prompts,
+        vllm_runner,
-                                distributed_executor_backend, model, dtype,
+        example_encoder_decoder_prompts[decoder_prompt_type],
-                                max_tokens, num_logprobs,
+        decoder_prompt_type,
-                                decoder_prompt_type) -> None:
+        model,
-        run_test(
+        dtype=dtype,
-            hf_runner,
+        max_tokens=max_tokens,
-            vllm_runner,
+        num_logprobs=num_logprobs,
-            example_encoder_decoder_prompts[decoder_prompt_type],
+        tensor_parallel_size=2,
-            decoder_prompt_type,
+        distributed_executor_backend=distributed_executor_backend,
-            model,
+    )
-            dtype=dtype,
-            max_tokens=max_tokens,
-            num_logprobs=num_logprobs,
-            tensor_parallel_size=2,
-            distributed_executor_backend=distributed_executor_backend,
-        )
--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py
+import pytest
+from ....utils import multi_gpu_test
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@pytest.mark.parametrize("model", [
+    "meta-llama/Llama-3.2-11B-Vision-Instruct",
+])
+def test_models(hf_runner, vllm_runner, image_assets,
+                distributed_executor_backend, model) -> None:
+    dtype = "half"
+    max_tokens = 5
+    num_logprobs = 5
+    tensor_parallel_size = 2
+    if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
+        from .test_mllama import models, run_test
+    else:
+        raise NotImplementedError(f"Unsupported model: {model}")
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model=models[0],
+        size_factors=[0.25, 0.5, 1.0],
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+    )
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -9,10 +9,10 @@ from vllm.sequence import SampleLogprobs
 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
-from ....utils import multi_gpu_test
+from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
-_LIMIT_IMAGE_PER_PROMPT = 1
+_LIMIT_IMAGE_PER_PROMPT = 3
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -47,14 +47,46 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]
-    assert output_str[0] == " "
+    hf_output_str = output_str
-    hf_output_str = output_str[1:]
    if hf_output_ids[-1] == eos_token_id:
        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
    return hf_output_ids, hf_output_str, out_logprobs
+def _get_inputs(
+    image_assets: _ImageAssets,
+    *,
+    size_factors: Optional[List[float]] = None,
+    sizes: Optional[List[Tuple[int, int]]] = None,
+) -> List[Tuple[List[str], PromptImageInput]]:
+    images = [asset.pil_image for asset in image_assets]
+    if size_factors is not None:
+        inputs_per_image = [(
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    elif sizes is not None:
+        inputs_per_image = [(
+            [
+                prompt if size is not None else text_only_prompts[0]
+                for size in sizes
+            ],
+            [
+                image.resize(size) if size is not None else None
+                for size in sizes
+            ],
+        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+        if len(sizes) == 0:
+            inputs_per_image.append(
+                (text_only_prompts, [None] * len(text_only_prompts)))
+    else:
+        raise ValueError("You must provide either `size_factors` or `sizes`")
+    return inputs_per_image
 @overload
 def run_test(
    hf_runner: Type[HfRunner],
@@ -103,39 +135,17 @@ def run_test(
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
 ):
-    images = [asset.pil_image for asset in image_assets]
+    _run_test(
+        hf_runner,
-    if size_factors is not None:
+        vllm_runner,
-        inputs_per_image = [(
+        _get_inputs(image_assets, size_factors=size_factors, sizes=sizes),
-            [prompt for _ in size_factors],
+        model,
-            [rescale_image_size(image, factor) for factor in size_factors],
+        dtype=dtype,
-        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+        max_tokens=max_tokens,
-    elif sizes is not None:
+        num_logprobs=num_logprobs,
-        inputs_per_image = [(
+        tensor_parallel_size=tensor_parallel_size,
-            [
+        distributed_executor_backend=distributed_executor_backend,
-                prompt if size is not None else text_only_prompts[0]
+    )
-                for size in sizes
-            ],
-            [
-                image.resize(size) if size is not None else None
-                for size in sizes
-            ],
-        ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
-        if len(sizes) == 0:
-            inputs_per_image.append(
-                (text_only_prompts, [None] * len(text_only_prompts)))
-    else:
-        raise ValueError("You must provide either `size_factors` or `sizes`")
-    _run_test(hf_runner,
-              vllm_runner,
-              inputs_per_image,
-              model,
-              dtype=dtype,
-              max_tokens=max_tokens,
-              num_logprobs=num_logprobs,
-              tensor_parallel_size=tensor_parallel_size,
-              distributed_executor_backend=distributed_executor_backend)
 def _run_test(
@@ -167,8 +177,8 @@ def _run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
-                     max_num_seqs=16,
                     max_model_len=4096,
+                     max_num_seqs=2,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True,
@@ -185,14 +195,9 @@ def _run_test(
    def process(hf_inputs: BatchEncoding):
        return hf_inputs
-    from transformers import AutoConfig
-    from transformers.models.mllama import MllamaConfig as MllamaConfigHf
-    # use transformer's MllamaConfig for hf_runner
-    # and vllm's MllamaConfig for vllm_runner
-    AutoConfig.register("mllama", MllamaConfigHf, exist_ok=True)
    with hf_runner(model,
                   dtype=dtype,
+                   model_kwargs={"device_map": "auto"},
                   postprocess_inputs=process,
                   auto_cls=AutoModelForVision2Seq) as hf_model:
        hf_outputs_per_image = [
@@ -203,8 +208,6 @@ def _run_test(
            for prompts, images in inputs
        ]
-    from vllm.transformers_utils.configs.mllama import MllamaConfig
-    AutoConfig.register("mllama", MllamaConfig, exist_ok=True)
    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        check_logprobs_close(
@@ -218,6 +221,7 @@ def _run_test(
        )
+@large_gpu_test(min_gb=48)
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "sizes",
@@ -236,13 +240,13 @@ def _run_test(
         (1024, 1024), (512, 1536), (512, 2028), None],
        # mllama has 8 possible aspect ratios, carefully set the sizes
        # to cover all of them
-    ],
+    ])
-)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype,
+def test_models_single_leading_image(hf_runner, vllm_runner, image_assets,
-                max_tokens, num_logprobs) -> None:
+                                     model, sizes, dtype, max_tokens,
+                                     num_logprobs) -> None:
    run_test(
        hf_runner,
        vllm_runner,
@@ -256,28 +260,79 @@ def test_models(hf_runner, vllm_runner, image_assets, model, sizes, dtype,
    )
-@multi_gpu_test(num_gpus=2)
+@large_gpu_test(min_gb=48)
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize(
-    "sizes",
-    [
-        [(512, 512), (1024, 512), (1536, 512), (2048, 512), (512, 1024),
-         (1024, 1024), (512, 1536), (512, 2028), None],
-    ],
-)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_models_distributed(hf_runner, vllm_runner, image_assets, model, sizes,
+def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets,
-                            dtype, max_tokens, num_logprobs) -> None:
+                                     model, dtype, max_tokens,
-    run_test(
+                                     num_logprobs) -> None:
+    stop_sign = image_assets[0].pil_image
+    cherry_blossom = image_assets[1].pil_image
+    inputs = [(
+        [
+            "<|image|><|image|><|begin_of_text|>Describe 2 images.",  # noqa: E501
+            "<|image|><|image|><|begin_of_text|>Describe 2 images.",  # noqa: E501
+            "<|image|><|image|><|image|><|begin_of_text|>Describe 3 images.",  # noqa: E501
+        ],
+        [
+            [stop_sign, cherry_blossom],
+            # Images with different sizes.
+            [
+                stop_sign.resize((512, 512)),
+                stop_sign,
+            ],
+            [
+                stop_sign,
+                stop_sign.resize((512, 1536)),
+                cherry_blossom.resize((512, 1024)),
+            ],
+        ])]
+    _run_test(
        hf_runner,
        vllm_runner,
-        image_assets,
+        inputs,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tensor_parallel_size=1,
+    )
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
+                                   dtype, max_tokens, num_logprobs) -> None:
+    stop_sign = image_assets[0].pil_image
+    cherry_blossom = image_assets[1].pil_image
+    inputs = [(
+        [
+            "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
+            "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, "  # noqa: E501
+            "which is a stop sign and which is a cherry blossom?",  # noqa: E501
+        ],
+        [
+            [stop_sign],
+            [stop_sign, cherry_blossom],
+        ])]
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        inputs,
        model,
-        sizes=sizes,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
-        tensor_parallel_size=2,
+        tensor_parallel_size=1,
    )
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -2,7 +2,8 @@ import os
 import pytest
-from vllm import LLM, SamplingParams
+from vllm import LLM, PoolingParams, SamplingParams
+from vllm.assets.image import ImageAsset
 from ..utils import fork_new_process_for_each_test
@@ -16,7 +17,7 @@ def test_plugin(dummy_opt_path):
 @fork_new_process_for_each_test
-def test_oot_registration(dummy_opt_path):
+def test_oot_registration_text_generation(dummy_opt_path):
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
    prompts = ["Hello, my name is", "The text does not matter"]
    sampling_params = SamplingParams(temperature=0)
@@ -29,3 +30,52 @@ def test_oot_registration(dummy_opt_path):
        # make sure only the first token is generated
        rest = generated_text.replace(first_token, "")
        assert rest == ""
+@fork_new_process_for_each_test
+def test_oot_registration_embedding(dummy_gemma2_embedding_path):
+    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
+    prompts = ["Hello, my name is", "The text does not matter"]
+    sampling_params = PoolingParams()
+    llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+    outputs = llm.encode(prompts, sampling_params)
+    for output in outputs:
+        assert all(v == 0 for v in output.outputs.embedding)
+image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+@fork_new_process_for_each_test
+def test_oot_registration_multimodal(dummy_llava_path):
+    os.environ["VLLM_PLUGINS"] = "register_dummy_model"
+    prompts = [{
+        "prompt": "What's in the image?<image>",
+        "multi_modal_data": {
+            "image": image
+        },
+    }, {
+        "prompt": "Describe the image<image>",
+        "multi_modal_data": {
+            "image": image
+        },
+    }]
+    sampling_params = SamplingParams(temperature=0)
+    llm = LLM(model=dummy_llava_path,
+              load_format="dummy",
+              max_num_seqs=1,
+              trust_remote_code=True,
+              gpu_memory_utilization=0.98,
+              max_model_len=4096,
+              enforce_eager=True,
+              limit_mm_per_prompt={"image": 1})
+    first_token = llm.get_tokenizer().decode(0)
+    outputs = llm.generate(prompts, sampling_params)
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        # make sure only the first token is generated
+        rest = generated_text.replace(first_token, "")
+        assert rest == ""
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
+import warnings
 import pytest
-import transformers
+import torch.cuda
-from vllm.model_executor.models import _MODELS, ModelRegistry
+from vllm.model_executor.models import (is_embedding_model,
+                                        is_text_generation_model,
+                                        supports_multimodal)
+from vllm.model_executor.models.registry import (_EMBEDDING_MODELS,
+                                                 _MULTIMODAL_MODELS,
+                                                 _SPECULATIVE_DECODING_MODELS,
+                                                 _TEXT_GENERATION_MODELS,
+                                                 ModelRegistry)
+from vllm.platforms import current_platform
+from ..utils import fork_new_process_for_each_test
-@pytest.mark.parametrize("model_cls", _MODELS)
-def test_registry_imports(model_cls):
-    if (model_cls in ("LlavaOnevisionForConditionalGeneration",
-                      "Qwen2VLForConditionalGeneration")
-            and transformers.__version__ < "4.45"):
-        pytest.skip("Waiting for next transformers release")
+@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
+def test_registry_imports(model_arch):
    # Ensure all model classes can be imported successfully
-    ModelRegistry.resolve_model_cls([model_cls])
+    model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
+    if model_arch in _SPECULATIVE_DECODING_MODELS:
+        pass  # Ignore these models which do not have a unified format
+    else:
+        assert is_text_generation_model(model_cls) is (
+            model_arch in _TEXT_GENERATION_MODELS
+            or model_arch in _MULTIMODAL_MODELS)
+        assert is_embedding_model(model_cls) is (model_arch
+                                                 in _EMBEDDING_MODELS)
+        assert supports_multimodal(model_cls) is (model_arch
+                                                  in _MULTIMODAL_MODELS)
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("model_arch,is_mm,init_cuda", [
+    ("LlamaForCausalLM", False, False),
+    ("MllamaForConditionalGeneration", True, False),
+    ("LlavaForConditionalGeneration", True, True),
+])
+def test_registry_is_multimodal(model_arch, is_mm, init_cuda):
+    assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
+    if init_cuda and current_platform.is_cuda_alike():
+        assert not torch.cuda.is_initialized()
+        ModelRegistry.resolve_model_cls(model_arch)
+        if not torch.cuda.is_initialized():
+            warnings.warn(
+                "This model no longer initializes CUDA on import. "
+                "Please test using a different one.",
+                stacklevel=2)
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
+    ("MLPSpeculatorPreTrainedModel", False, False),
+    ("DeepseekV2ForCausalLM", True, False),
+    ("Qwen2VLForConditionalGeneration", True, True),
+])
+def test_registry_is_pp(model_arch, is_pp, init_cuda):
+    assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
+    if init_cuda and current_platform.is_cuda_alike():
+        assert not torch.cuda.is_initialized()
+        ModelRegistry.resolve_model_cls(model_arch)
+        if not torch.cuda.is_initialized():
+            warnings.warn(
+                "This model no longer initializes CUDA on import. "
+                "Please test using a different one.",
+                stacklevel=2)
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
 import warnings
 from typing import Dict, List, Optional, Sequence, Tuple, Union
+import torch
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
+from vllm.utils import is_cpu
 TokensText = Tuple[List[int], str]
@@ -247,6 +250,7 @@ def check_logprobs_close(
 def build_model_context(model_name: str,
                        tokenizer_name: Optional[str] = None,
                        trust_remote_code: bool = False,
+                        dtype: Optional[Union[str, torch.dtype]] = None,
                        mm_processor_kwargs: Optional[Dict] = None,
                        limit_mm_per_prompt: Optional[Dict] = None):
    """Creates an InputContext for a given model.
@@ -264,12 +268,15 @@ def build_model_context(model_name: str,
    """
    if tokenizer_name is None:
        tokenizer_name = model_name
+    if dtype is None:
+        dtype = "bfloat16" if is_cpu() else "half"
    model_config = ModelConfig(
        model_name,
        tokenizer_name,
        tokenizer_mode="auto",
        trust_remote_code=trust_remote_code,
-        dtype="float32",
+        dtype=dtype,
        seed=0,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt=limit_mm_per_prompt,

--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket):
        # Throws an error in first forward pass.
        with pytest.raises(RAISED_ERROR):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id=uuid.uuid4()):
                pass
@@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket):
        # Engine is errored, should get ENGINE_DEAD_ERROR.
        with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id=uuid.uuid4()):
                pass
@@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket):
        # Generate call should throw ENGINE_DEAD_ERROR
        with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id=uuid.uuid4()):
                pass
@@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket):
        # with reference to the original KeyError("foo")
        with pytest.raises(MQEngineDeadError) as execinfo:
            async for _ in client.generate(
-                    inputs="Hello my name is",
+                    prompt="Hello my name is",
                    sampling_params=SamplingParams(max_tokens=10),
                    request_id=uuid.uuid4()):
                pass
@@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket):
        # Invalid request should fail, but not crash the server.
        with pytest.raises(ValueError):
-            async for _ in client.generate(inputs="Hello my name is",
+            async for _ in client.generate(prompt="Hello my name is",
                                           sampling_params=SamplingParams(),
                                           request_id="abcd-1",
                                           lora_request=LoRARequest(
@@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket):
                pass
        # This request should be okay.
-        async for _ in client.generate(inputs="Hello my name is",
+        async for _ in client.generate(prompt="Hello my name is",
                                       sampling_params=SamplingParams(),
                                       request_id="abcd-2"):
            pass

--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
@@ -20,7 +20,7 @@ async def generate(
    count = 0
    async for out in client.generate(
            request_id=request_id,
-            inputs="Hello my name is Robert and",
+            prompt="Hello my name is Robert and",
            sampling_params=SamplingParams(max_tokens=num_tokens,
                                           temperature=0)):

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -17,7 +17,6 @@ NUM_PROMPTS = [10]
 DEFAULT_SERVER_ARGS: List[str] = [
    "--disable-log-requests",
-    "--use-v2-block-manager",
    "--worker-use-ray",
    "--gpu-memory-utilization",
    "0.85",
@@ -37,6 +36,7 @@ DEFAULT_SERVER_ARGS: List[str] = [
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("is_async", [True])
 @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
 @pytest.mark.asyncio
 async def test_multi_step(
    example_prompts,
@@ -49,6 +49,7 @@ async def test_multi_step(
    is_async: bool,
    num_logprobs: Optional[int],
    attention_backend: str,
+    enable_chunked_prefill: bool,
    monkeypatch,
 ) -> None:
    """Test vLLM engine with multi-step scheduling in an OpenAI-protocol
@@ -74,6 +75,10 @@ async def test_multi_step(
      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
                    completions endpoint; `None` -> no logprobs
    """
+    if enable_chunked_prefill and \
+        (pp_size > 1 or attention_backend != "FLASH_ATTN"):
+        pytest.skip("Multi-step with Chunked-Prefill only supports"
+                    "PP=1 and FLASH_ATTN backend")
    override_backend_env_variable(monkeypatch, attention_backend)
@@ -93,6 +98,9 @@ async def test_multi_step(
    if eager_mode:
        ms_server_args.append("--enforce-eager")
+    if enable_chunked_prefill:
+        ms_server_args.append("--enable-chunked-prefill")
    distributed_args = [
        "--tensor-parallel-size",
        str(tp_size),
@@ -133,3 +141,85 @@ async def test_multi_step(
        name_0="hf",
        name_1="vllm",
    )
+@pytest.mark.parametrize(("tp_size, pp_size"), [
+    (1, 2),
+])
+@pytest.mark.asyncio
+async def test_multi_step_pp_smoke(
+    tp_size: int,
+    pp_size: int,
+    monkeypatch,
+) -> None:
+    """
+    Smoke test for the vLLM engine with multi-step scheduling in an
+    OpenAI-protocol client/server environment.
+    This test compares the outputs between multi-step scheduling and
+    single-step scheduling. Notably, this test lets the engines generate
+    more tokens (default is 5) and test for an exact match over all the
+    tokens.
+    Args:
+      tp_size: degree of tensor-parallelism
+      pp_size: degree of pipeline-parallelism
+      monkeypatch: pytest fixture passed to `override_backend_env_variable`
+    """
+    model = "JackFram/llama-160m"
+    num_scheduler_steps = 8
+    attention_backend = "FLASH_ATTN"
+    max_num_seqs = 3
+    override_backend_env_variable(monkeypatch, attention_backend)
+    # Prompt from the ShareGPT dataset
+    prompts = [
+        "in the jtbd context whats a push?",  # codespell:ignore
+        "in the jtbd context whats a push?",  # codespell:ignore
+        "in the jtbd context whats a push?",  # codespell:ignore
+        "in the jtbd context whats a push?",  # codespell:ignore
+    ]
+    # Use varying max_tokens to introduce scheduling randomness.
+    max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
+    assert len(prompts) == len(max_tokens)
+    test_args = [
+        "--tensor-parallel-size",
+        str(tp_size), "--pipeline-parallel-size",
+        str(pp_size), "--max-num-seqs",
+        str(max_num_seqs)
+    ]
+    server_args = DEFAULT_SERVER_ARGS + test_args
+    ms_server_args = DEFAULT_SERVER_ARGS + \
+       ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
+       test_args
+    # Spin up client/server & issue completion API requests.
+    # Default `max_wait_seconds` is 240 but it was empirically
+    # raised 5x to 1200 *just for this test* due to
+    # observed timeouts in GHA CI
+    ref_completions = await completions_with_server_args(
+        prompts=prompts,
+        model_name=model,
+        server_cli_args=server_args,
+        num_logprobs=None,
+        max_wait_seconds=5 * 240,
+        max_tokens=max_tokens)
+    test_completions = await completions_with_server_args(
+        prompts=prompts,
+        model_name=model,
+        server_cli_args=ms_server_args,
+        num_logprobs=None,
+        max_wait_seconds=5 * 240,
+        max_tokens=max_tokens)
+    # Assert multi-step scheduling produces identical tokens
+    # to single-step scheduling.
+    ref_generations = get_client_text_generations(ref_completions)
+    test_generations = get_client_text_generations(test_completions)
+    assert ref_generations == test_generations
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
 # Test the LLMEngine with multi-step-decoding
+import copy
 from typing import Optional
 import pytest
@@ -16,6 +17,7 @@ NUM_PROMPTS = [10]
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("tp_size", [1])
+@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
@@ -28,6 +30,7 @@ def test_multi_step_llm(
    model: str,
    dtype: str,
    tp_size: int,
+    enable_chunked_prefill: bool,
    max_tokens: int,
    enforce_eager: int,
    num_scheduler_steps: int,
@@ -51,6 +54,7 @@ def test_multi_step_llm(
      model: model under test (same for single- and multi-step engines)
      dtype: tensor datatype for engine to utilize
      tp_size: degree of tensor-parallelism
+      enable_chunked_prefill: chunked-prefill on/off
      max_tokens: the maximum number of tokens to generate
      enforce_eager
      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
@@ -72,7 +76,7 @@ def test_multi_step_llm(
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
-            use_v2_block_manager=True,
+            enable_chunked_prefill=enable_chunked_prefill,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
@@ -164,7 +168,6 @@ def test_multi_step_llm_w_prompt_logprobs(
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.7,
            tensor_parallel_size=tp_size,
-            use_v2_block_manager=True,
            num_scheduler_steps=num_scheduler_steps,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
@@ -192,3 +195,158 @@ def test_multi_step_llm_w_prompt_logprobs(
        name_0="hf",
        name_1="vllm",
    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("tp_size", [1])
+@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
+@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
+@pytest.mark.parametrize("num_logprobs", [None, 5])
+def test_multi_step_llm_chunked_prefill_prefix_cache(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    tp_size: int,
+    max_tokens: int,
+    enforce_eager: int,
+    num_scheduler_steps: int,
+    num_prompts: int,
+    num_logprobs: Optional[int],
+) -> None:
+    """Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
+    Set up contrived scenario which tests for a possible failure mode of
+    scheduling with multi-step+"single-step chunked prefill"+APC
+    "single-step chunked prefill" here refers to the current vLLM multi-step+
+    chunked-prefill implementation, which requires that a prefill may only
+    be scheduled in the same step as decodes if the prefill prompt fits in a
+    single chunk (note that "complete" multi-step+chunked-prefill would allow
+    a prefill to span multiple chunks & multiple steps but that is not yet
+    the case.)
+    "APC" is short for "automatic prefix caching".
+    This test creates a scenario where the scheduler must decide whether/how
+    to schedule a prefill with a prompt that exceeds the available token budget.
+    The correct behavior for multi-step+"single-step chunked prefill"+APC is to
+    put off scheduling the prefill until a future step.
+    Validate that:
+    * Multi-step kernels do not raise an exception due to incorrect scheduler
+      behavior
+    * Generated tokens match between
+      multi-step+"single-step chunked prefill"+APC and
+      single-step scheduling.
+    * (If logprobs are enabled) check logprobs are close enough
+    Args:
+      vllm_runner: vLLM model runner fixture
+      example_prompts: test fixture providing example prompts
+      model: model under test (same for single- and multi-step engines)
+      dtype: tensor datatype for engine to utilize
+      tp_size: degree of tensor-parallelism
+      max_tokens: the maximum number of tokens to generate
+      enforce_eager
+      num_scheduler_steps: for multi-step scheduling, GPU-side steps per
+                           GPU -> CPU output transfer
+      num_prompts: number of example prompts under test
+      num_logprobs: corresponds to the `logprobs` argument to the OpenAI
+                    completions endpoint; `None` -> 1 logprob returned.
+    """
+    # Set up contrived test for correct scheduling behavior with
+    # multi-step+"single-step chunked prefill"+APC.
+    #
+    # Assume block_size=16
+    #
+    # Assume max_num_batched_tokens=48
+    #   => Per-step token budget=48
+    #
+    # 1. Scheduler schedules 0th prompt (24 tokens)
+    #      => Remaining token budget=24
+    # 2. Scheduler attempts to schedule 1st prompt (30 tokens)
+    #    * 30 tokens exceeds 24 token remaining budget
+    #    * Correct behavior: do not schedule this prompt in this step
+    #    * Incorrect behavior: schedule prompt chunk
+    #      * `do_sample=False` for this prompt in this step
+    #      * Chunk size = (remaining tokens // block size) * block size
+    #
+    # The Incorrect scheduling behavior - if it occurs - will cause an exception
+    # in the model runner resulting from `do_sample=False`.
+    assert len(example_prompts) >= 2
+    challenge_prompts = copy.deepcopy(example_prompts)
+    challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
+                            'inference and serving engine for LLMs.\n'
+                            )  # 24 tok
+    challenge_prompts[1] = (
+        'Briefly describe the major milestones in the '
+        'development of artificial intelligence from 1950 to 2020.\n'
+    )  # 30 tok
+    # If necessary, adjust the length of `challenge_prompts` to match
+    # `num_prompts`
+    if len(challenge_prompts) < num_prompts:
+        challenge_prompts = (challenge_prompts *
+                             ((num_prompts // len(challenge_prompts)) + 1))
+    challenge_prompts = challenge_prompts[:num_prompts]
+    assert len(challenge_prompts) == num_prompts
+    # Multi-step baseline (no chunked-prefill / prefix-caching flags)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            enforce_eager=enforce_eager,
+            gpu_memory_utilization=0.7,
+            tensor_parallel_size=tp_size,
+            num_scheduler_steps=num_scheduler_steps,
+            max_model_len=48,
+            max_num_batched_tokens=48,
+            max_num_seqs=4,
+            block_size=16,
+    ) as vllm_model:
+        outputs_baseline = (vllm_model.generate_greedy(
+            challenge_prompts, max_tokens) if num_logprobs is None else
+                            vllm_model.generate_greedy_logprobs(
+                                challenge_prompts, max_tokens, num_logprobs))
+    # multi-step+"single-step chunked prefill"+APC
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            enforce_eager=enforce_eager,
+            gpu_memory_utilization=0.7,
+            tensor_parallel_size=tp_size,
+            enable_chunked_prefill=True,
+            enable_prefix_caching=True,
+            num_scheduler_steps=num_scheduler_steps,
+            max_model_len=48,
+            max_num_batched_tokens=48,
+            max_num_seqs=4,
+            block_size=16,
+    ) as vllm_model:
+        outputs_w_features = (vllm_model.generate_greedy(
+            challenge_prompts, max_tokens) if num_logprobs is None else
+                              vllm_model.generate_greedy_logprobs(
+                                  challenge_prompts, max_tokens, num_logprobs))
+    if num_logprobs is None:
+        # No-logprobs test
+        check_outputs_equal(
+            outputs_0_lst=outputs_baseline,
+            outputs_1_lst=outputs_w_features,
+            name_0="multi-step",
+            name_1="multi-step+features",
+        )
+    else:
+        # Yes-logprobs test
+        check_logprobs_close(
+            outputs_0_lst=outputs_baseline,
+            outputs_1_lst=outputs_w_features,
+            name_0="multi-step",
+            name_1="multi-step+features",
+        )
--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -5,7 +5,7 @@ from unittest.mock import patch
 import pytest
 import torch
-from vllm.inputs import InputContext, LLMInputs
+from vllm.inputs import DecoderOnlyInputs, InputContext, token_inputs
 from vllm.inputs.registry import InputRegistry
 from vllm.multimodal import MultiModalRegistry
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
@@ -31,7 +31,7 @@ def use_processor_mock():
    """Patches the internal model input processor with an override callable."""
    def custom_processor(ctx: InputContext,
-                         llm_inputs: LLMInputs,
+                         inputs: DecoderOnlyInputs,
                         *,
                         num_crops=DEFAULT_NUM_CROPS):
        # For testing purposes, we don't worry about the llm inputs / return
@@ -74,38 +74,61 @@ def mm_model_cls():
 # lambda whose signature matches max token calcs extra & mapper + extra kwargs
 get_num_crops = lambda ctx, *, num_crops=DEFAULT_NUM_CROPS: num_crops
 custom_mapper = lambda ctx, data, *, num_crops=DEFAULT_NUM_CROPS: {
-    "num_pixels": torch.zeros(size=(1, num_crops + 1, 3, 336, 336))
+    "pixel_values": torch.zeros(size=(1, num_crops + 1, 3, 336, 336))
 }
-### Test for default processor logic & mm_processor_kwargs wrapping
+### Tests for default processor logic & mm_processor_kwargs wrapping
 def test_default_processor_is_a_noop():
    """Ensure that by default, there is no processor override."""
    dummy_registry = InputRegistry()
    ctx = build_model_context(DUMMY_MODEL_ID)
    processor = dummy_registry.create_input_processor(ctx.model_config)
-    proc_inputs = LLMInputs(prompt_token_ids=[], prompt="")
+    proc_inputs = token_inputs(prompt_token_ids=[], prompt="")
    proc_outputs = processor(inputs=proc_inputs)
    assert proc_inputs is proc_outputs
-@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE])
+def _get_num_crops_info(init_num_crops: int, inference_num_crops: int):
-def test_processor_default_kwargs(use_processor_mock, num_crops):
+    """Get the init / inference kwargs and expected num_crops for this test."""
-    """Ensure input processors can use processor kwargs."""
-    dummy_registry = InputRegistry()
    # If we have a value for num_crops, pass the override value and make
    # sure we get that value as a return-value from out mock processor,
    # otherwise fall back to the default value
-    mm_processor_kwargs = None if num_crops is None else {
+    init_kwargs = None if init_num_crops is None else {
-        "num_crops": num_crops
+        "num_crops": init_num_crops
    }
-    expected_num_crops = DEFAULT_NUM_CROPS if num_crops is None else num_crops
+    inference_kwargs = None if inference_num_crops is None else {
-    ctx = build_model_context(DUMMY_MODEL_ID,
+        "num_crops": inference_num_crops
-                              mm_processor_kwargs=mm_processor_kwargs)
+    }
-    processor = dummy_registry.create_input_processor(ctx.model_config)
+    if inference_num_crops is not None:
+        expected_seq_count = inference_num_crops
+    elif init_num_crops is not None:
+        expected_seq_count = init_num_crops
+    else:
+        expected_seq_count = DEFAULT_NUM_CROPS
+    return init_kwargs, inference_kwargs, expected_seq_count
+@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
+    (None, None),
+    (NUM_CROPS_OVERRIDE, None),
+    (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
+])
+def test_input_processor_kwargs(use_processor_mock, init_num_crops,
+                                inference_num_crops):
+    """Ensure input processors can use processor kwargs."""
+    dummy_registry = InputRegistry()
+    init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
+        init_num_crops, inference_num_crops)
-    num_crops_val = processor(LLMInputs(prompt_token_ids=[], prompt=""))
+    ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs)
-    assert num_crops_val == expected_num_crops
+    processor = dummy_registry.create_input_processor(ctx.model_config)
+    num_crops_val = processor(
+        token_inputs(prompt_token_ids=[],
+                     prompt="",
+                     mm_processor_kwargs=inference_kwargs))
+    assert num_crops_val == expected_seq_count
 @pytest.mark.parametrize(
@@ -124,11 +147,16 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
                                            mm_processor_kwargs):
    """Ensure that input processors filter out invalid mm_processor_kwargs"""
    dummy_registry = InputRegistry()
+    # Should filter out the init time kwargs
    ctx = build_model_context(DUMMY_MODEL_ID,
                              mm_processor_kwargs=mm_processor_kwargs)
    processor = dummy_registry.create_input_processor(ctx.model_config)
-    num_crops_val = processor(LLMInputs(prompt_token_ids=[], prompt=""))
+    # Should filter out the inference time kwargs
+    num_crops_val = processor(
+        token_inputs(prompt_token_ids=[],
+                     prompt="",
+                     mm_processor_kwargs=mm_processor_kwargs))
    assert num_crops_val == DEFAULT_NUM_CROPS
@@ -271,32 +299,34 @@ def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
    assert mapped_inputs["pixel_values"].shape[1] == num_crops + 1
-@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE])
+@pytest.mark.parametrize("init_num_crops,inference_num_crops", [
-def test_custom_mapper_kwarg_overrides(image_assets, num_crops):
+    (None, None),
+    (NUM_CROPS_OVERRIDE, None),
+    (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE),
+])
+def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
+                                       inference_num_crops):
    """Ensure custom mappers can use processor kwargs."""
-    mm_processor_kwargs = None if num_crops is None else {
+    init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info(
-        "num_crops": num_crops
+        init_num_crops, inference_num_crops)
-    }
-    expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
+                              mm_processor_kwargs=init_kwargs,
                              limit_mm_per_prompt={"image": 1})
    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our num_crops value back from the mm_processor_kwargs.
    image = image_assets[0].pil_image
    mm_inputs = {"image": image}
-    with patch.object(
+    # Patch the image registry for phi3v with our lambda that is compatible
-            mm_registry._get_plugin("image"),
+    # with overrides, then ensure that calling the method correctly echoes
-            "_default_input_mapper",
+    # our num_crops value back from the mm_processor_kwargs.
-        {mm_model_cls(): custom_mapper},
+    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
-    ):
+        mm_model_cls())
-        mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs)
+    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs,
+                                          inference_kwargs)
    assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1
@@ -316,6 +346,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, num_crops):
 def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
                                                mm_processor_kwargs):
    """Ensure that custom mappers filter out invalid mm_processor_kwargs"""
+    # Should filter out the init time kwargs
    ctx = build_model_context(MULTIMODAL_MODEL_ID,
                              trust_remote_code=True,
                              mm_processor_kwargs=mm_processor_kwargs,
@@ -323,17 +354,16 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
    mm_registry = MultiModalRegistry()
    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our num_crops value back from the mm_processor_kwargs.
    image = image_assets[0].pil_image
    mm_inputs = {"image": image}
-    with patch.object(
+    # Patch the image registry for phi3v with our lambda that is compatible
-            mm_registry._get_plugin("image"),
+    # with overrides, then ensure that calling the method correctly echoes
-            "_default_input_mapper",
+    # our num_crops value back from the mm_processor_kwargs.
-        {mm_model_cls(): custom_mapper},
+    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
-    ):
+        mm_model_cls())
-        mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs)
+    # Should filter out the inference time kwargs
+    mapped_inputs = mm_registry.map_input(
+        ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs)
    assert mapped_inputs["pixel_values"].shape[1] == DEFAULT_NUM_CROPS + 1
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
-from typing import Optional
-import torch
 from vllm import ModelRegistry
-from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-class MyOPTForCausalLM(OPTForCausalLM):
-    def compute_logits(
-            self, hidden_states: torch.Tensor,
-            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
-        # this dummy model always predicts the first token
-        logits = super().compute_logits(hidden_states, sampling_metadata)
-        if logits is not None:
-            logits.zero_()
-            logits[:, 0] += 1.0
-        return logits
 def register():
-    # register our dummy model
+    # Test directly passing the model
+    from .my_opt import MyOPTForCausalLM
    if "MyOPTForCausalLM" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model("MyOPTForCausalLM", MyOPTForCausalLM)
+    # Test passing lazy model
+    if "MyGemma2Embedding" not in ModelRegistry.get_supported_archs():
+        ModelRegistry.register_model(
+            "MyGemma2Embedding",
+            "vllm_add_dummy_model.my_gemma_embedding:MyGemma2Embedding",
+        )
+    if "MyLlava" not in ModelRegistry.get_supported_archs():
+        ModelRegistry.register_model("MyLlava",
+                                     "vllm_add_dummy_model.my_llava:MyLlava")
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+from typing import List, Optional, Union
+import torch
+from vllm.attention import AttentionMetadata
+from vllm.model_executor.models.gemma2 import Gemma2EmbeddingModel
+from vllm.sequence import IntermediateTensors
+class MyGemma2Embedding(Gemma2EmbeddingModel):
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = super().forward(
+            input_ids,
+            positions,
+            kv_caches,
+            attn_metadata,
+            intermediate_tensors=intermediate_tensors,
+            inputs_embeds=inputs_embeds,
+        )
+        if isinstance(hidden_states, IntermediateTensors):
+            return hidden_states
+        # Return all-zero embeddings
+        return torch.zeros_like(hidden_states)
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+from typing import Optional
+import torch
+from vllm.inputs import INPUT_REGISTRY
+from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
+                                              dummy_data_for_llava,
+                                              get_max_llava_image_tokens,
+                                              input_processor_for_llava)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+@MULTIMODAL_REGISTRY.register_image_input_mapper()
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
+class MyLlava(LlavaForConditionalGeneration):
+    def compute_logits(
+            self, hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+        # this dummy model always predicts the first token
+        logits = super().compute_logits(hidden_states, sampling_metadata)
+        if logits is not None:
+            logits.zero_()
+            logits[:, 0] += 1.0
+        return logits
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+from typing import Optional
+import torch
+from vllm.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+class MyOPTForCausalLM(OPTForCausalLM):
+    def compute_logits(
+            self, hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+        # this dummy model always predicts the first token
+        logits = super().compute_logits(hidden_states, sampling_metadata)
+        if logits is not None:
+            logits.zero_()
+            logits[:, 0] += 1.0
+        return logits