remove unused code

84dfdb17 · zhuwenwen · f137e58c · f137e58c · f137e58c · f137e58c
Commit 84dfdb17 authored Jul 31, 2025 by zhuwenwen
20 changed files
--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
-from array import array
-from typing import Callable, Dict, Mapping, Optional
-from unittest.mock import patch
-
-import pytest
-import torch
-import os
-
-from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext,
-                         InputRegistry, ProcessorInputs, token_inputs)
-from vllm.multimodal import MultiModalRegistry
-from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
-
-from ..models.utils import build_model_context
-from ..utils import models_path_prefix
-
-# Used for fast tests where the model doesn't matter
-DUMMY_MODEL_ID = os.path.join(models_path_prefix, "facebook/opt-125m")
-# Used for tests that need a multimodal model
-MULTIMODAL_MODEL_ID = os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")
-
-# For mm_processor_kwargs - we test overrides by defining mocks for each place
-# it is used, and ensuring that we can pass processor kwargs an override value
-# to receive the intended result for things like sequence length etc.
-DEFAULT_MAX_DYNAMIC_PATCH = 6
-MAX_DYNAMIC_PATCH_OVERRIDE = 4
-
-
-# Mocks for all of the places that we use the mm_processor_kwargs
-# to override values in different callables
-@pytest.fixture
-def use_processor_mock():
-    """Patches the internal model input processor with an override callable."""
-
-    def custom_processor(ctx: InputContext,
-                         inputs: DecoderOnlyInputs,
-                         *,
-                         max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH):
-        # For testing purposes, we don't worry about the prompt
-        return token_inputs(
-            prompt_token_ids=[],
-            mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch})
-
-    with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor",
-               return_value=custom_processor):
-        yield
-
-
-@pytest.fixture
-def use_dummy_data_mock():
-    """Patches the internal model input processor with an override callable."""
-
-    def custom_dummy_data_factory(self,
-                                  ctx: InputContext,
-                                  seq_len: int,
-                                  mm_counts: Mapping[str, int],
-                                  *,
-                                  max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH):
-        seq_data = SequenceData(
-            array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * max_dynamic_patch))
-        return DummyData(seq_data, None)
-
-    with patch(
-            "vllm.inputs.registry.InputRegistry._default_dummy_data_factory",
-            custom_dummy_data_factory):
-        yield
-
-
-# Lazy import to avoid CUDA reinitialization error
-def mm_model_cls():
-    from vllm.model_executor.models.internvl import InternVLChatModel
-
-    return InternVLChatModel
-
-
-# lambda whose signature matches max token calcs extra & mapper + extra kwargs
-get_max_dynamic_patch = lambda ctx, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: max_dynamic_patch  # noqa: E501
-custom_mapper = lambda ctx, data, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: {  # noqa: E501
-    "pixel_values": torch.zeros(size=(1, max_dynamic_patch + 1, 3, 448, 448))
-}
-
-
-### Tests for default processor logic & mm_processor_kwargs wrapping
-def test_default_processor_is_a_noop():
-    """Ensure that by default, there is no processor override."""
-    dummy_registry = InputRegistry()
-    ctx = build_model_context(DUMMY_MODEL_ID)
-    processor = dummy_registry.create_input_processor(ctx.model_config)
-    proc_inputs = token_inputs(prompt_token_ids=[], prompt="")
-    proc_outputs = processor(inputs=proc_inputs)
-    assert proc_inputs is proc_outputs
-
-
-def _get_max_dynamic_patch_info(init_max_dynamic_patch: int,
-                                inference_max_dynamic_patch: int):
-    """Get the init / inference kwargs and expected max_dynamic_patch."""
-    # If we have a value for max_dynamic_patch, pass the override value and make
-    # sure we get that value as a return-value from out mock processor,
-    # otherwise fall back to the default value
-    init_kwargs = None if init_max_dynamic_patch is None else {
-        "max_dynamic_patch": init_max_dynamic_patch
-    }
-    inference_kwargs = None if inference_max_dynamic_patch is None else {
-        "max_dynamic_patch": inference_max_dynamic_patch
-    }
-    if inference_max_dynamic_patch is not None:
-        expected_seq_count = inference_max_dynamic_patch
-    elif init_max_dynamic_patch is not None:
-        expected_seq_count = init_max_dynamic_patch
-    else:
-        expected_seq_count = DEFAULT_MAX_DYNAMIC_PATCH
-    return init_kwargs, inference_kwargs, expected_seq_count
-
-
-def _get_processed_max_dynamic_patch(
-    processor: Callable[[ProcessorInputs], ProcessorInputs],
-    inference_kwargs: Optional[Dict[str, int]],
-) -> int:
-    processed_inputs = processor(
-        token_inputs(prompt_token_ids=[],
-                     prompt="",
-                     mm_processor_kwargs=inference_kwargs))
-
-    assert "type" in processed_inputs
-    assert processed_inputs["type"] == "token"
-    assert "mm_processor_kwargs" in processed_inputs
-    return processed_inputs["mm_processor_kwargs"]["max_dynamic_patch"]
-
-
-@pytest.mark.parametrize(
-    "init_max_dynamic_patch,inference_max_dynamic_patch", [
-        (None, None),
-        (MAX_DYNAMIC_PATCH_OVERRIDE, None),
-        (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE),
-    ])
-def test_input_processor_kwargs(use_processor_mock, init_max_dynamic_patch,
-                                inference_max_dynamic_patch):
-    """Ensure input processors can use processor kwargs."""
-    dummy_registry = InputRegistry()
-
-    (init_kwargs, inference_kwargs,
-     expected_seq_count) = _get_max_dynamic_patch_info(
-         init_max_dynamic_patch, inference_max_dynamic_patch)
-
-    ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs)
-    processor = dummy_registry.create_input_processor(ctx.model_config)
-    max_dynamic_patch_val = _get_processed_max_dynamic_patch(
-        processor, inference_kwargs)
-
-    assert max_dynamic_patch_val == expected_seq_count
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_processor_with_sad_kwarg_overrides(use_processor_mock,
-                                            mm_processor_kwargs):
-    """Ensure that input processors filter out invalid mm_processor_kwargs"""
-    dummy_registry = InputRegistry()
-    # Should filter out the init time kwargs
-    ctx = build_model_context(DUMMY_MODEL_ID,
-                              mm_processor_kwargs=mm_processor_kwargs)
-
-    processor = dummy_registry.create_input_processor(ctx.model_config)
-    # Should filter out the inference time kwargs
-    max_dynamic_patch_val = _get_processed_max_dynamic_patch(
-        processor, mm_processor_kwargs)
-    assert max_dynamic_patch_val == DEFAULT_MAX_DYNAMIC_PATCH
-
-
-### Test overrides for the dummy data
-@pytest.mark.parametrize("max_dynamic_patch",
-                         [None, MAX_DYNAMIC_PATCH_OVERRIDE])
-def test_dummy_data_kwarg_overrides(use_dummy_data_mock, max_dynamic_patch):
-    """Ensure dummy data factories can use processor kwargs."""
-    mm_processor_kwargs = None if max_dynamic_patch is None else {
-        "max_dynamic_patch": max_dynamic_patch
-    }
-    expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH
-                          if max_dynamic_patch is None else max_dynamic_patch)
-    dummy_registry = InputRegistry()
-    ctx = build_model_context(DUMMY_MODEL_ID,
-                              mm_processor_kwargs=mm_processor_kwargs)
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    # NOTE: seq_len is thrown away here since this will leverage the
-    # default dummy data factory that we have patched in, whose seq
-    # len is solely dependent on the value of the mm_processor_kwargs.
-    dummy_data = dummy_registry.dummy_data_for_profiling(
-        ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock,
-                                             mm_processor_kwargs):
-    """Ensure the dummy data factory filters out invalid mm_processor_kwargs"""
-    dummy_registry = InputRegistry()
-    ctx = build_model_context(DUMMY_MODEL_ID,
-                              mm_processor_kwargs=mm_processor_kwargs)
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    # NOTE: seq_len is thrown away here since this will leverage the
-    # default dummy data factory that we have patched in, whose seq
-    # len is solely dependent on the value of the mm_processor_kwargs.
-    dummy_data = dummy_registry.dummy_data_for_profiling(
-        ctx.model_config, seq_len=-1, mm_registry=mm_registry)
-    assert len(
-        dummy_data.seq_data.prompt_token_ids) == DEFAULT_MAX_DYNAMIC_PATCH
-
-
-### Test overrides for the max token count per multimodal instance
-@pytest.mark.parametrize("max_dynamic_patch",
-                         [None, MAX_DYNAMIC_PATCH_OVERRIDE])
-def test_max_tokens_kwarg_overrides(max_dynamic_patch):
-    """Ensure max token calcs can use processor kwargs."""
-    mm_processor_kwargs = None if max_dynamic_patch is None else {
-        "max_dynamic_patch": max_dynamic_patch
-    }
-    expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH
-                          if max_dynamic_patch is None else max_dynamic_patch)
-
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our max_dynamic_patch value back from the mm_processor_kwargs.
-    with patch.object(
-            mm_registry._get_plugin("image"),
-            "_max_mm_tokens",
-        {mm_model_cls(): get_max_dynamic_patch},
-    ):
-        max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
-            ctx.model_config)
-
-    assert expected_seq_count == max_multimodal_tokens
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
-    """Ensure that max token calcs filters out invalid mm_processor_kwargs"""
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    # Similar before, but since these kwargs get filtered,
-    # we always get our default value back.
-    with patch.object(
-            mm_registry._get_plugin("image"),
-            "_max_mm_tokens",
-        {mm_model_cls(): get_max_dynamic_patch},
-    ):
-        max_multimodal_tokens = mm_registry.get_max_multimodal_tokens(
-            ctx.model_config)
-
-    assert max_multimodal_tokens == DEFAULT_MAX_DYNAMIC_PATCH
-
-
-### Test overrides for the mapper
-@pytest.mark.parametrize(
-    "max_dynamic_patch",
-    [DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE])
-def test_default_mapper_with_processor_kwargs(image_assets, max_dynamic_patch):
-    """Ensure that the mapper processor kwargs can fall back to HF models."""
-    # NOTE - we don't validate bad inputs for the default mapper, because it's
-    # through the automodel interface in transformers, so we can't easily
-    # inspect what kwargs are or are not allowed.
-    ctx = build_model_context(
-        MULTIMODAL_MODEL_ID,
-        task="generate",
-        trust_remote_code=True,
-        mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch},
-        limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": image}
-
-    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs)
-    # pixel vals should have shape: [batch, max_dynamic_patch+1, ...]
-    assert mapped_inputs["pixel_values"].shape[1] == max_dynamic_patch + 1
-
-
-@pytest.mark.parametrize(
-    "init_max_dynamic_patch,inference_max_dynamic_patch", [
-        (None, None),
-        (MAX_DYNAMIC_PATCH_OVERRIDE, None),
-        (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE),
-    ])
-def test_custom_mapper_kwarg_overrides(image_assets, init_max_dynamic_patch,
-                                       inference_max_dynamic_patch):
-    """Ensure custom mappers can use processor kwargs."""
-    (init_kwargs, inference_kwargs,
-     expected_seq_count) = _get_max_dynamic_patch_info(
-         init_max_dynamic_patch, inference_max_dynamic_patch)
-
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=init_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": image}
-
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our max_dynamic_patch value back from the mm_processor_kwargs.
-    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
-        mm_model_cls())
-    mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs,
-                                          inference_kwargs)
-
-    assert mapped_inputs["pixel_values"].shape[1] == expected_seq_count + 1
-
-
-@pytest.mark.parametrize(
-    "mm_processor_kwargs",
-    [
-        # Not part of the signature
-        {
-            "does_not_exist": 100
-        },
-        # Part of the signature, not keyword only
-        {
-            "ctx": "something bad"
-        }
-    ])
-def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
-                                                mm_processor_kwargs):
-    """Ensure that custom mappers filters out invalid mm_processor_kwargs"""
-    # Should filter out the init time kwargs
-    ctx = build_model_context(MULTIMODAL_MODEL_ID,
-                              task="generate",
-                              trust_remote_code=True,
-                              mm_processor_kwargs=mm_processor_kwargs,
-                              limit_mm_per_prompt={"image": 1})
-
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image
-    mm_inputs = {"image": image}
-
-    # Patch the image registry for phi3v with our lambda that is compatible
-    # with overrides, then ensure that calling the method correctly echos
-    # our max_dynamic_patch value back from the mm_processor_kwargs.
-    mm_registry._get_plugin("image").register_input_mapper(custom_mapper)(
-        mm_model_cls())
-    # Should filter out the inference time kwargs
-    mapped_inputs = mm_registry.map_input(
-        ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs)
-
-    assert mapped_inputs["pixel_values"].shape[1] == (
-        DEFAULT_MAX_DYNAMIC_PATCH + 1)
--- a/tests/prompt_adapter/__init__.py
+++ b/tests/prompt_adapter/__init__.py
--- a/tests/prompt_adapter/test_bloom.py
+++ b/tests/prompt_adapter/test_bloom.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import os
-
-import vllm
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from ..utils import models_path_prefix
-
-
-MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m")
-PA_PATH = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM')
-
-
-def do_sample(llm, pa_name: str, pa_id: int):
-
-    prompts = [
-        "Tweet text : @nationalgridus I have no water and the bill is \
-        current and paid. Can you do something about this? Label : ",
-        "Tweet text : @nationalgridus Looks good thanks! Label : "
-    ]
-    sampling_params = vllm.SamplingParams(temperature=0.0,
-                                          max_tokens=3,
-                                          stop_token_ids=[3])
-
-    outputs = llm.generate(prompts,
-                           sampling_params,
-                           prompt_adapter_request=PromptAdapterRequest(
-                               pa_name, pa_id, PA_PATH, 8) if pa_id else None)
-
-    # Print the outputs.
-    generated_texts = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-@pytest.mark.parametrize("enforce_eager", [True, False])
-def test_twitter_prompt_adapter(enforce_eager: bool):
-    llm = vllm.LLM(MODEL_PATH,
-                   enforce_eager=enforce_eager,
-                   enable_prompt_adapter=True,
-                   max_prompt_adapter_token=8)
-
-    expected_output = ['complaint', 'no complaint']
-
-    assert do_sample(llm, "twitter_pa", pa_id=1) == expected_output
--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import EngineArgs, LLMEngine, SamplingParams
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from ..utils import models_path_prefix
-import os
-
-
-MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m") 
-pa_path = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM') 
-pa_path2 = os.path.join(models_path_prefix, 'swapnilbp/angry_tweet_ptune') 
-
-
-def do_sample(engine):
-
-    prompts = [
-        ("Tweet text: I have complaints! Label: ",
-         SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
-         PromptAdapterRequest("hate_speech", 1, pa_path2, 8)),
-        ("Tweet text: I have no problems Label: ",
-         SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
-         PromptAdapterRequest("hate_speech2", 2, pa_path2, 8)),
-        ("Tweet text: I have complaints! Label: ",
-         SamplingParams(temperature=0.0, max_tokens=3), None),
-        ("Tweet text: I have no problems Label: ",
-         SamplingParams(temperature=0.0, max_tokens=3, stop_token_ids=[3]),
-         PromptAdapterRequest("complain", 3, pa_path, 8)),
-    ]
-
-    request_id = 0
-    results = set()
-    while prompts or engine.has_unfinished_requests():
-        if prompts:
-            prompt, sampling_params, pa_request = prompts.pop(0)
-            engine.add_request(str(request_id),
-                               prompt,
-                               sampling_params,
-                               prompt_adapter_request=pa_request)
-            request_id += 1
-
-        request_outputs = engine.step()
-
-        for request_output in request_outputs:
-            if request_output.finished:
-                results.add(request_output.outputs[0].text)
-    return results
-
-
-def test_multi_prompt_adapters():
-    engine_args = EngineArgs(model=MODEL_PATH,
-                             max_prompt_adapters=3,
-                             enable_prompt_adapter=True,
-                             max_prompt_adapter_token=8)
-    engine = LLMEngine.from_engine_args(engine_args)
-    expected_output = {
-        ' quot;I', 'hate speech', 'no complaint', 'not hate speech'
-    }
-    assert do_sample(engine) == expected_output
--- a/tests/prompt_adapter/test_pa_lora.py
+++ b/tests/prompt_adapter/test_pa_lora.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from huggingface_hub import snapshot_download
-
-from vllm import EngineArgs, LLMEngine, SamplingParams
-from vllm.lora.request import LoRARequest
-from vllm.prompt_adapter.request import PromptAdapterRequest
-from ..utils import models_path_prefix
-import os
-
-
-MODEL_PATH = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") 
-# pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
-# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-pa_path = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune") 
-lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
-
-
-def do_sample(engine):
-
-    prompt_text = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]"  # noqa: E501
-
-    # first prompt with a prompt adapter and second without adapter
-    prompts = [
-        (prompt_text,
-         SamplingParams(temperature=0.0, max_tokens=100,
-                        stop=["[/assistant]"]),
-         PromptAdapterRequest("hate_speech", 1, pa_path,
-                              8), LoRARequest("sql_test", 1, lora_path)),
-        (prompt_text,
-         SamplingParams(temperature=0.0, max_tokens=100,
-                        stop=["[/assistant]"]), None,
-         LoRARequest("sql_test", 1, lora_path)),
-    ]
-
-    request_id = 0
-    results = set()
-    while prompts or engine.has_unfinished_requests():
-        if prompts:
-            prompt, sampling_params, pa_request, lora_request = prompts.pop(0)
-            engine.add_request(str(request_id),
-                               prompt,
-                               sampling_params,
-                               prompt_adapter_request=pa_request,
-                               lora_request=lora_request)
-            request_id += 1
-
-        request_outputs = engine.step()
-
-        for request_output in request_outputs:
-            if request_output.finished:
-                results.add(request_output.outputs[0].text)
-    return results
-
-
-def test_lora_prompt_adapter():
-    engine_args = EngineArgs(model=MODEL_PATH,
-                             enable_prompt_adapter=True,
-                             enable_lora=True,
-                             max_num_seqs=60,
-                             max_prompt_adapter_token=8)
-    engine = LLMEngine.from_engine_args(engine_args)
-    result = do_sample(engine)
-
-    expected_output = {
-        "  SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' "  # noqa: E501
-    }
-    assert result == expected_output
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import os
-
-from vllm import SamplingParams
-
-from .conftest import get_output_from_llm_generator
-from ...utils import models_path_prefix
-
-
-@pytest.mark.parametrize("common_llm_kwargs",
-                         [{
-                             "model": os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-                         }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Speculative max model len > overridden max model len should raise.
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-                "num_speculative_tokens": 5,
-                "max_model_len": 129,
-            },
-            "max_model_len": 128,
-        },
-        {
-            # Speculative max model len > draft max model len should raise.
-            # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-                "num_speculative_tokens": 5,
-                "max_model_len": 2048 + 1,
-            },
-        },
-        {
-            # Speculative max model len > target max model len should raise.
-            # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-                "num_speculative_tokens": 5,
-                "max_model_len": 131072 + 1,
-            },
-        },
-    ])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
-    """Verify that speculative decoding validates speculative_max_model_len.
-    """
-    output_len = 128
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-    ]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-
-    with pytest.raises(ValueError, match="cannot be larger than"):
-        get_output_from_llm_generator(test_llm_generator, prompts,
-                                      sampling_params)
\ No newline at end of file
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-
-However, we still need to verify below scenario could be passed:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various number of speculative tokens.
-
-With those tests, we can say at least, EAGLE would not break the
-correctness for the target model outputs.
-"""
-
-import pytest
-import os
-
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-import vllm.envs as envs
-
-os.environ["LLAMA_NN"] = "0"
-
-# main model
-MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
-
-# speculative model
-SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random")
-
-# max. number of speculative tokens: this corresponds to
-# num_heads in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 4
-
-# precision
-PRECISION = "float32" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half"
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                      per_test_common_llm_kwargs,
-                                      baseline_llm_kwargs, test_llm_kwargs,
-                                      batch_size: int, output_len: int,
-                                      seed: int):
-
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_logprobs": False,
-    },
-}, {
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_logprobs": True,
-    },
-}])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                   per_test_common_llm_kwargs,
-                                   baseline_llm_kwargs, test_llm_kwargs,
-                                   batch_size: int, output_len: int, seed: int,
-                                   logprobs: int):
-
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "enforce_eager": False,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness_cuda_graph(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
-    """Verify greedy equality with cuda graph enabled and different
-    batch sizes."""
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 8,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "model": SPEC_MODEL,
-                "num_speculative_tokens": k,
-            },
-        }
-        # Try a range of num. speculative tokens
-        for k in range(1, 1 + MAX_SPEC_TOKENS)
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_different_k(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int):
-    """Verify that eagle speculative decoding produces exact equality
-    to without spec decode with different values of num_speculative_tokens.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
-                             per_test_common_llm_kwargs, baseline_llm_kwargs,
-                             test_llm_kwargs, batch_size: int, output_len: int,
-                             seed: int):
-    """Verify that eagle speculative decoding produces exact equality
-    to without spec decode when speculation is disabled for large
-    batch sizes.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": "float16",
-
-        # Main model
-        "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize("seed", [1])
-def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                             per_test_common_llm_kwargs,
-                                             baseline_llm_kwargs,
-                                             test_llm_kwargs, batch_size: int,
-                                             output_len: int, seed: int):
-
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # 2 for small prompt, 256//16 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 16,
-        "max_model_len": (2 + 256 // 16) * 16,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": "float16",
-
-        # Main model
-        "model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize("seed", [1])
-def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                             per_test_common_llm_kwargs,
-                                             baseline_llm_kwargs,
-                                             test_llm_kwargs, batch_size: int,
-                                             output_len: int, seed: int):
-
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # 2 for small prompt, 256//16 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 16,
-        "max_model_len": (2 + 256 // 16) * 16,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": "float16",
-
-        # Main model
-        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize("seed", [1])
-def test_qwen2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                            per_test_common_llm_kwargs,
-                                            baseline_llm_kwargs,
-                                            test_llm_kwargs, batch_size: int,
-                                            output_len: int, seed: int):
-
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
-
-
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests which cover integration of the speculative decoding framework with
-other features, e.g. cuda graphs.
-"""
-
-import pytest
-import os
-
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-
-
-os.environ["LLAMA_NN"] = "0"
-
-MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix,"JackFram/llama-68m"),
-
-        # Verify equality when cuda graphs allowed.
-        "enforce_eager": False,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Identical models.
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-                "num_speculative_tokens": 5,
-            },
-        },
-    ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("output_len", [32])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
-                                per_test_common_llm_kwargs,
-                                baseline_llm_kwargs, test_llm_kwargs,
-                                batch_size: int, output_len: int, seed: int):
-    """Verify spec decode equality when cuda graphs are enabled.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        # Explicitly specify draft model quantization
-        {
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
-                "num_speculative_tokens": 5,
-                "quantization": "gptq",
-            },
-        },
-        # Explicitly specify GPTQ-based draft model to use marlin quantization
-        {
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
-                "num_speculative_tokens": 5,
-                "quantization": "marlin",
-            },
-        },
-        # Not explicitly specify draft model quantization
-        {
-            "speculative_config": {
-                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
-                "num_speculative_tokens": 5,
-                "quantization": None,
-            },
-        },
-    ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
-                                               per_test_common_llm_kwargs,
-                                               baseline_llm_kwargs,
-                                               test_llm_kwargs,
-                                               batch_size: int, seed: int):
-    """Verify spec decode works well with draft model quantization configs.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=32,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": MAIN_MODEL,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-        "num_speculative_tokens": 3,
-        "disable_mqa_scorer": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, seed: int):
-    """Verify that speculative decoding generates the same output
-    with batch expansion scorer and mqa scorer.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
\ No newline at end of file
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests which cover integration of the speculative decoding framework with
-tensor parallelism.
-"""
-
-import json
-from typing import Optional
-
-import pytest
-import torch
-import os
-
-from vllm.platforms import current_platform
-
-from .conftest import run_equality_correctness_test_tp
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-
-
-os.environ["LLAMA_NN"] = "0"
-
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce-eager",
-        "--tensor-parallel-size",
-        "2"
-    ]])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    [
-        "--speculative_config",
-        json.dumps({
-            "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-            "num_speculative_tokens": 3,
-        }),
-    ],
-    [
-        "--speculative_config",
-        json.dumps({
-            "model": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-        }),
-    ],
-])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
-                              baseline_llm_kwargs, test_llm_kwargs,
-                              batch_size: int, output_len: int, seed: int):
-    """Verify greedy equality when tensor parallelism is used.
-    """
-    if current_platform.is_rocm():
-        pytest.skip("hip is not well-supported yet")
-    run_equality_correctness_test_tp(os.path.join(models_path_prefix, "JackFram/llama-68m"),
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     output_len,
-                                     seed,
-                                     temperature=0.0)
-
-
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "tensor_parallel_size": 2,
-
-        # Precision
-        "dtype": "bfloat16",
-
-        # Main model
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-            "num_speculative_tokens": 5,
-            "draft_tensor_parallel_size": 1,
-        },
-    }])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_draft_model_tp_lt_target_model_tp2(vllm_runner, common_llm_kwargs,
-                                            per_test_common_llm_kwargs,
-                                            baseline_llm_kwargs,
-                                            test_llm_kwargs, batch_size: int,
-                                            seed: int):
-    """Verify spec decode works well with smaller tp for draft models.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, max_output_len=32, seed=seed,
-                                     temperature=0.0)
-
-
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "tensor_parallel_size": 2,
-
-        # Precision
-        "dtype": "bfloat16",
-
-        # Main model
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-     [{
-        "enable_chunked_prefill": False,
-        "max_num_batched_tokens": 32,
-        "max_model_len": 32,
-        "max_num_seqs": 4
-    }])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-            "num_speculative_tokens": 5,
-            "draft_tensor_parallel_size": 1,
-        },
-    }])
-@pytest.mark.parametrize("logprobs", [None])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_chunked_prefill_tp2(vllm_runner, common_llm_kwargs,
-                                         per_test_common_llm_kwargs,
-                                         baseline_llm_kwargs, test_llm_kwargs,
-                                         logprobs: Optional[int],
-                                         batch_size: int, seed: int):
-    """Verify spec decode works well with same and different TP size for
-    the draft model with chunked prefill.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, max_output_len=32, seed=seed,
-                                     temperature=0.0)
-
-
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "tensor_parallel_size": 2,
-
-        # Precision
-        "dtype": "bfloat16",
-
-        # Main model
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-     [{
-        "enable_chunked_prefill": False,
-        "max_num_batched_tokens": 32,
-        "max_model_len": 32,
-        "max_num_seqs": 4
-    }])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-            "num_speculative_tokens": 5,
-            "draft_tensor_parallel_size": 1,
-        },
-    }])
-@pytest.mark.parametrize("logprobs", [2])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_chunked_prefill_tp2_with_logprobs(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
-        batch_size: int, seed: int):
-    """Verify spec decode works well with same and different TP size for
-    the draft model with chunked prefill.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     max_output_len=32,
-                                     seed=seed,
-                                     temperature=0.0,
-                                     logprobs=logprobs)
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests which cover integration of the speculative decoding framework with
-tensor parallelism.
-"""
-
-import json
-
-import openai
-import pytest
-import torch
-import os
-
-from .conftest import run_equality_correctness_test_tp
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-
-os.environ["LLAMA_NN"] = "0"
-
-MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
-SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
-
-
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "tensor_parallel_size": 4,
-
-        # Precision
-        "dtype": "bfloat16",
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {},
-])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": 5,
-            "draft_tensor_parallel_size": 1,
-        },
-    }])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_draft_model_tp_lt_target_model_tp4(vllm_runner, common_llm_kwargs,
-                                            per_test_common_llm_kwargs,
-                                            baseline_llm_kwargs,
-                                            test_llm_kwargs, batch_size: int,
-                                            seed: int):
-    """Verify spec decode works well with smaller tp for draft models.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, max_output_len=32, seed=seed,
-                                     temperature=0.0)
-
-
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "tensor_parallel_size": 4,
-
-        # Precision
-        "dtype": "bfloat16",
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": 5,
-            "max_model_len": 32,
-        },
-    }])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # This must be a good bit larger than speculative_max_model_len so that
-        # we can test the case where all seqs are skipped, but still small to
-        # ensure fast test.
-        64,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_skip_speculation(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-                          baseline_llm_kwargs, test_llm_kwargs,
-                          batch_size: int, output_len: int, seed: int):
-    """Verify job failure with RuntimeError when all sequences skip speculation.
-    We do this by setting the max model len of the draft model to an
-    artificially low value, such that when the sequences grow beyond it, they
-    are skipped in speculative decoding.
-
-    TODO: fix it to pass without raising Error. (#5814)
-    """
-    with pytest.raises(RuntimeError):
-        run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size, max_output_len=output_len, seed=seed,
-                                        temperature=0.0)
\ No newline at end of file
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from itertools import cycle
-
-import pytest
-import os
-
-from vllm import SamplingParams
-
-from ..utils import maybe_enable_chunked_prefill
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-
-os.environ["LLAMA_NN"] = "0"
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-        "num_speculative_tokens": 3,
-        "disable_logprobs": False,
-    },
-}, {
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-        "num_speculative_tokens": 3,
-        "disable_logprobs": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        7,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 8])
-def test_logprobs_equality(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int, logprobs: int, prefill_chunk_size: int):
-    """Verify output logprobs are equal with and without speculative decoding,
-        as well as with and without chunked prefill.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-        "num_speculative_tokens": 3,
-        "disable_logprobs": False,
-    },
-}, {
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-        "num_speculative_tokens": 6,
-        "disable_logprobs": False,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
-                              per_test_common_llm_kwargs, baseline_llm_kwargs,
-                              test_llm_kwargs, batch_size: int,
-                              output_len: int, seed: int, logprobs: int):
-    """Verify logprob greedy equality with different speculation lengths.
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [{
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-            "num_speculative_tokens": 3,
-            "disable_logprobs": False,
-            # Artificially limit the draft model max model len; this forces
-            # vLLM to skip speculation once the sequences grow beyond 32-k
-            # tokens.
-            "max_model_len": 32,
-        },
-    }])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1])
-def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
-                                        per_test_common_llm_kwargs,
-                                        baseline_llm_kwargs, test_llm_kwargs,
-                                        batch_size: int, output_len: int,
-                                        seed: int, logprobs: int):
-    """Verify logprobs greedy equality when some sequences skip speculation.
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-        "num_speculative_tokens": 3,
-        "disable_logprobs": False,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [6])
-def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
-                         per_test_common_llm_kwargs, baseline_llm_kwargs,
-                         test_llm_kwargs, batch_size: int, output_len: int,
-                         seed: int, logprobs: int):
-    """Verify at least one logprob result has num_logprobs+1, which tests the
-    case where the sampled token is not in top-k logprobs.
-
-    Ideally, this test should validate equality with non-spec by getting
-    logprobs. This is left as future improvement.
-    """
-    temperature = 1.0
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-        "San Francisco is know for its",
-        "Facebook was created in 2004 by",
-        "Curious George is a",
-        "Python 3.11 brings improvements to its",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-        logprobs=logprobs,
-    )
-
-    sd_args = {
-        **common_llm_kwargs,
-        **per_test_common_llm_kwargs,
-        **test_llm_kwargs,
-    }
-
-    with vllm_runner(**sd_args) as vllm_model:
-        sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
-
-    num_returned_logprobs = [
-        len(seq_logprobs) for seq_logprobs in sd_outputs[-1]
-    ]
-
-    # Assert one of the returned logprobs has > num_logprobs (indicating the
-    # sampled token is not in top-k).
-    assert any(
-        [num_returned > logprobs for num_returned in num_returned_logprobs])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix,"JackFram/llama-160m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-        "num_speculative_tokens": 3,
-        "disable_logprobs": True,
-    },
-}])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("logprobs", [0])
-def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int, logprobs: int):
-    """Check the behavior when logprobs are disabled.
-    Token choices should match with the base model.
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-
-However, we still need to verify that the scenarios below pass:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various number of speculative tokens.
-
-With those tests, we can at least say that Medusa does not break the
-correctness of the target model outputs.
-"""
-
-import pytest
-import os
-
-from ..utils import maybe_enable_chunked_prefill
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-
-os.environ["LLAMA_NN"] = "0"
-
-# main model
-# lmsys/vicuna-7b-v1.3 was to be used but it's causing
-# OOM in CI pipeline, so using a smaller model.
-MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
-
-# speculative model
-SPEC_MODEL = os.path.join(models_path_prefix, "abhigoyal/vllm-medusa-llama-68m-random")
-
-# max number of speculative tokens: this corresponds to
-# num_heads in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 5
-
-# precision
-PRECISION = "float16"
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 4])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                       per_test_common_llm_kwargs,
-                                       baseline_llm_kwargs, test_llm_kwargs,
-                                       batch_size: int, output_len: int,
-                                       seed: int, prefill_chunk_size: int):
-    """Verify greedy equality with different batch size."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-            "disable_logprobs": False,
-        },
-    },
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-            "disable_logprobs": True,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    8,
-])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size: int, output_len: int,
-                                    seed: int, logprobs: int,
-                                    prefill_chunk_size: int):
-    """Verify greedy equality when logprobs and prompt logprobs are
-    requested, with logprobs enabled and disabled in the speculative config."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        max_output_len=output_len,
-        seed=seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "enforce_eager": False,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 8])
-def test_medusa_e2e_greedy_correctness_cuda_graph(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int, prefill_chunk_size: int):
-    """Verify greedy equality with cuda graph enabled and different 
-    batch sizes."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 16,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int, prefill_chunk_size: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "model": SPEC_MODEL,
-                "num_speculative_tokens": k,
-            },
-        }
-        # Try a range of num. speculative tokens
-        for k in range(1, 1 + MAX_SPEC_TOKENS)
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_different_k(vllm_runner, common_llm_kwargs,
-                            per_test_common_llm_kwargs, baseline_llm_kwargs,
-                            test_llm_kwargs, batch_size: int, output_len: int,
-                            seed: int, prefill_chunk_size: int):
-    """Verify that medusa speculative decoding produces exact equality
-    to without spec decode with different values of num_speculative_tokens.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
-                              per_test_common_llm_kwargs, baseline_llm_kwargs,
-                              test_llm_kwargs, batch_size: int,
-                              output_len: int, seed: int,
-                              prefill_chunk_size: int):
-    """Verify that medusa speculative decoding produces exact equality
-    to without spec decode when speculation is disabled for large
-    batch sizes.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.6
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4,
-        "disable_mqa_scorer": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, seed: int, prefill_chunk_size: int):
-    """Verify that speculative decoding generates the same output 
-    with batch expansion scorer and mqa scorer.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
--- a/tests/spec_decode/e2e/test_seed.py
+++ b/tests/spec_decode/e2e/test_seed.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import os
-
-from .conftest import run_equality_correctness_test
-from ...utils import models_path_prefix
-
-# main model
-MAIN_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
-
-# speculative model
-SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # speculative config
-        "speculative_config": {
-            "model": os.path.join(models_path_prefix, "JackFram/llama-160m"),
-            "num_speculative_tokens": 3,
-        },
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
-@pytest.mark.parametrize("batch_size", [1, 8, 32])
-@pytest.mark.parametrize("temperature", [0.1, 1.0])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        20,
-    ])
-def test_seeded_consistency(vllm_runner, common_llm_kwargs,
-                            per_test_common_llm_kwargs, baseline_llm_kwargs,
-                            test_llm_kwargs, batch_size: int,
-                            temperature: float, output_len: int):
-    """Verify outputs are consistent across multiple runs with same seed
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        max_output_len=output_len,
-        temperature=temperature,
-        disable_seed=False,
-    )
-
-    # Ensure this same test does fail if we _don't_ include per-request seeds
-    with pytest.raises(AssertionError):
-        run_equality_correctness_test(
-            vllm_runner,
-            common_llm_kwargs,
-            per_test_common_llm_kwargs,
-            baseline_llm_kwargs,
-            test_llm_kwargs,
-            batch_size,
-            max_output_len=output_len,
-            temperature=temperature,
-            disable_seed=True,
-        )
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py