[V0 Deprecation] Remove V0 Spec Decode workers (#21152)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 Deprecation] Remove V0 Spec Decode workers (#21152)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
dd572c0a · Woosuk Kwon · GitHub · 9ffe905a · 9ffe905a · 9ffe905a
Unverified Commit dd572c0a authored Jul 18, 2025 by Woosuk Kwon Committed by GitHub Jul 18, 2025
20 changed files
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from itertools import cycle
-
-import pytest
-
-from vllm import SamplingParams
-
-from ..utils import maybe_enable_chunked_prefill
-from .conftest import run_equality_correctness_test
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-160m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": "JackFram/llama-68m",
-        "num_speculative_tokens": 3,
-        "disable_logprobs": False,
-    },
-}, {
-    "speculative_config": {
-        "model": "JackFram/llama-68m",
-        "num_speculative_tokens": 3,
-        "disable_logprobs": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        7,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12])
-def test_logprobs_equality(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int, logprobs: int, prefill_chunk_size: int):
-    """Verify output logprobs are equal with and without speculative decoding,
-        as well as with and without chunked prefill.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": "JackFram/llama-160m",
-        "num_speculative_tokens": 3,
-        "disable_logprobs": False,
-    },
-}, {
-    "speculative_config": {
-        "model": "JackFram/llama-160m",
-        "num_speculative_tokens": 6,
-        "disable_logprobs": False,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
-                              per_test_common_llm_kwargs, baseline_llm_kwargs,
-                              test_llm_kwargs, batch_size: int,
-                              output_len: int, seed: int, logprobs: int):
-    """Veriy logprob greedy equality with different speculation lens.
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [{
-        "speculative_config": {
-            "model": "JackFram/llama-160m",
-            "num_speculative_tokens": 3,
-            "disable_logprobs": False,
-            # Artificially limit the draft model max model len; this forces
-            # vLLM to skip speculation once the sequences grow beyond 32-k
-            # tokens.
-            "max_model_len": 32,
-        },
-    }])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1])
-def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
-                                        per_test_common_llm_kwargs,
-                                        baseline_llm_kwargs, test_llm_kwargs,
-                                        batch_size: int, output_len: int,
-                                        seed: int, logprobs: int):
-    """Verify logprobs greedy equality when some sequences skip speculation.
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": "JackFram/llama-160m",
-        "num_speculative_tokens": 3,
-        "disable_logprobs": False,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [6])
-def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
-                         per_test_common_llm_kwargs, baseline_llm_kwargs,
-                         test_llm_kwargs, batch_size: int, output_len: int,
-                         seed: int, logprobs: int):
-    """Verify at least one logprob result has num_logprobs+1, which tests the
-    case where the sampled token is not in top-k logprobs.
-
-    Ideally, this test should validate equality with non-spec by getting
-    logprobs. This is left as future improvement.
-    """
-    temperature = 1.0
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-        "San Francisco is know for its",
-        "Facebook was created in 2004 by",
-        "Curious George is a",
-        "Python 3.11 brings improvements to its",
-    ]
-
-    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-        logprobs=logprobs,
-    )
-
-    sd_args = {
-        **common_llm_kwargs,
-        **per_test_common_llm_kwargs,
-        **test_llm_kwargs,
-    }
-
-    with vllm_runner(**sd_args) as vllm_model:
-        sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
-
-    num_returned_logprobs = [
-        len(seq_logprobs) for seq_logprobs in sd_outputs[-1]
-    ]
-
-    # Assert one of the returned logprobs has > num_logprobs (indicating the
-    # sampled token is not in top-k).
-    assert any(
-        [num_returned > logprobs for num_returned in num_returned_logprobs])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-160m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": "JackFram/llama-68m",
-        "num_speculative_tokens": 3,
-        "disable_logprobs": True,
-    },
-}])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("logprobs", [0])
-def test_logprobs_disabled(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int, logprobs: int):
-    """Check the behavior when logprobs are disabled.
-    Token choices should match with the base model.
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-
-However, we still need to verify below scenario could be passed:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various number of speculative tokens.
-
-With those tests, we can say at least, Medusa would not break the
-correctness for the target model outputs.
-"""
-
-import pytest
-
-from ..utils import maybe_enable_chunked_prefill
-from .conftest import run_equality_correctness_test
-
-# main model
-# lmsys/vicuna-7b-v1.3 was to be used but it's causing
-# OOM in CI pipeline, so using a smaller model.
-MAIN_MODEL = "JackFram/llama-68m"
-
-# speculative model
-SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random"
-
-# max number of speculative tokens: this corresponds to
-# num_heads in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 5
-
-# precision
-PRECISION = "float32"
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                       per_test_common_llm_kwargs,
-                                       baseline_llm_kwargs, test_llm_kwargs,
-                                       batch_size: int, output_len: int,
-                                       seed: int, prefill_chunk_size: int):
-    """Verify greedy equality with different batch size."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-            "disable_logprobs": False,
-        },
-    },
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-            "disable_logprobs": True,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    8,
-])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size: int, output_len: int,
-                                    seed: int, logprobs: int,
-                                    prefill_chunk_size: int):
-    """Verify greedy equality with different batch size."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        max_output_len=output_len,
-        seed=seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "enforce_eager": False,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_e2e_greedy_correctness_cuda_graph(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int, prefill_chunk_size: int):
-    """Verify greedy equality with cuda graph enabled and different 
-    batch sizes."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 16,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int, prefill_chunk_size: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "model": SPEC_MODEL,
-                "num_speculative_tokens": k,
-            },
-        }
-        # Try a range of num. speculative tokens
-        for k in range(1, 1 + MAX_SPEC_TOKENS)
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_different_k(vllm_runner, common_llm_kwargs,
-                            per_test_common_llm_kwargs, baseline_llm_kwargs,
-                            test_llm_kwargs, batch_size: int, output_len: int,
-                            seed: int, prefill_chunk_size: int):
-    """Verify that medusa speculative decoding produces exact equality
-    to without spec decode with different values of num_speculative_tokens.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_medusa_disable_queue(vllm_runner, common_llm_kwargs,
-                              per_test_common_llm_kwargs, baseline_llm_kwargs,
-                              test_llm_kwargs, batch_size: int,
-                              output_len: int, seed: int,
-                              prefill_chunk_size: int):
-    """Verify that medusa speculative decoding produces exact equality
-    to without spec decode when speculation is disabled for large
-    batch sizes.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4,
-        "disable_mqa_scorer": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, seed: int, prefill_chunk_size: int):
-    """Verify that speculative decoding generates the same output 
-    with batch expansion scorer and mqa scorer.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-
-However, we still need to verify below scenario could be passed:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various number of speculative tokens.
-
-With those tests, we can say at least, MLPSpeculator would not break the
-correctness for the target model outputs.
-"""
-
-from unittest.mock import patch
-
-import pytest
-
-from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
-
-from ..utils import maybe_enable_chunked_prefill
-from .conftest import run_equality_correctness_test
-
-# main model
-MAIN_MODEL = "JackFram/llama-160m"
-
-# speculative model
-SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator"
-
-# max. number of speculative tokens: this corresponds to
-# n_predict in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 3
-
-# precision
-PRECISION = "float32"
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [4, 32])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 32])
-def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size: int, output_len: int,
-                                    seed: int, prefill_chunk_size: int):
-    """Verify greedy equality with different batch size."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "disable_logprobs": False,
-        },
-    },
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "disable_logprobs": True,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [8])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                 per_test_common_llm_kwargs,
-                                 baseline_llm_kwargs, test_llm_kwargs,
-                                 batch_size: int, output_len: int, seed: int,
-                                 logprobs: int, prefill_chunk_size: int):
-    """Verify greedy equality with different batch size."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    # NOTE Test is sensitive enough st if we don't enable chunked prefill
-    # scheduling on baseline too, we get slightly different logprobs, ending
-    # up sampling different tokens at the tail (ie top tokens don't change).
-    # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected?
-    maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        max_output_len=output_len,
-        seed=seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [2048])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs,
-                                 per_test_common_llm_kwargs,
-                                 baseline_llm_kwargs, test_llm_kwargs,
-                                 batch_size: int, output_len: int,
-                                 prefill_chunk_size: int, seed: int):
-    """Verify acceptance rate with different batch size and large output 
-    length."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  temperature=0.0,
-                                  seed=seed,
-                                  expected_acceptance_rate=0.48)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # Speculative config
-        "speculative_config": {
-            "model": SPEC_MODEL,
-        },
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
-@pytest.mark.parametrize("output_len", [64])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("temperature", [1.0])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-@pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size: int, output_len: int,
-                                    temperature: float,
-                                    prefill_chunk_size: int, seed: int):
-    """Verify seeded runs produce the same output."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  temperature=temperature,
-                                  seed=seed)
-
-    # Ensure this same test does fail if we _don't_ include per-request seeds
-    with pytest.raises(AssertionError):
-        run_equality_correctness_test(vllm_runner,
-                                      common_llm_kwargs,
-                                      per_test_common_llm_kwargs,
-                                      baseline_llm_kwargs,
-                                      test_llm_kwargs,
-                                      batch_size,
-                                      max_output_len=output_len,
-                                      temperature=temperature,
-                                      seed=seed,
-                                      disable_seed=True)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 16,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-@pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        prefill_chunk_size: int, seed: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 16,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-def test_mlp_e2e_greedy_correctness_with_padding(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        prefill_chunk_size: int, seed: int):
-    """Verify greedy equality when the vocab dimension is padded
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-
-    # Default pad_to is 64, test model has vocab_size of 32000
-    def patched_pad_vocab_size(vocab_size, pad_to=None):
-        return pad_vocab_size(vocab_size, pad_to=32064)
-
-    with patch(
-            "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size",
-            patched_pad_vocab_size):
-        run_equality_correctness_test(vllm_runner,
-                                      common_llm_kwargs,
-                                      per_test_common_llm_kwargs,
-                                      baseline_llm_kwargs,
-                                      test_llm_kwargs,
-                                      batch_size,
-                                      max_output_len=output_len,
-                                      seed=seed,
-                                      temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "model": SPEC_MODEL,
-                "num_speculative_tokens": k,
-            },
-        }
-        # Try a range of num. speculative tokens
-        for k in range(1, 1 + MAX_SPEC_TOKENS)
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-@pytest.mark.parametrize("seed", [1])
-def test_mlp_different_k(vllm_runner, common_llm_kwargs,
-                         per_test_common_llm_kwargs, baseline_llm_kwargs,
-                         test_llm_kwargs, batch_size: int,
-                         prefill_chunk_size: int, seed: int, output_len: int):
-    """Verify that mlp speculative decoding produces exact equality
-    to without spec decode with different values of num_speculative_tokens.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "disable_by_batch_size": 4,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-# Speculative decoding is disabled when sequences reach decoding and the batch
-# consists of single-token requests. Hence we set `max_num_seqs`
-# >= `speculative_disable_by_batch_size` to test feature interaction.
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-@pytest.mark.parametrize("seed", [1])
-def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int,
-                           prefill_chunk_size: int, seed: int,
-                           output_len: int):
-    """Verify that mlp speculative decoding produces exact equality
-    to without spec decode when speculation is disabled for large
-    batch sizes.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": MAIN_MODEL,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "disable_mqa_scorer": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-@pytest.mark.parametrize("seed", [1])
-def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, prefill_chunk_size: int, seed: int):
-    """Verify that speculative decoding generates the same output 
-    with batch expansion scorer and mqa scorer.
-    """
-    maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-
-However, we still need to verify below scenario could be passed:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various number of speculative tokens.
-
-With those tests, we can say at least, mtp would not break the
-correctness for the target model outputs.
-"""
-
-import pytest
-
-from .conftest import run_equality_correctness_test
-
-# main model
-MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
-
-# max. number of speculative tokens: this corresponds to
-# num_nextn_predict_layers in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 1
-
-# precision
-PRECISION = "bfloat16"
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.85
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                    per_test_common_llm_kwargs,
-                                    baseline_llm_kwargs, test_llm_kwargs,
-                                    batch_size: int, output_len: int,
-                                    seed: int):
-
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.85
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-            "disable_logprobs": False,
-        },
-    },
-    {
-        "speculative_config": {
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-            "disable_logprobs": True,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                 per_test_common_llm_kwargs,
-                                 baseline_llm_kwargs, test_llm_kwargs,
-                                 batch_size: int, output_len: int, seed: int,
-                                 logprobs: int):
-
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "enforce_eager": False,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-        "gpu_memory_utilization": 0.85
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
-                                               per_test_common_llm_kwargs,
-                                               baseline_llm_kwargs,
-                                               test_llm_kwargs,
-                                               batch_size: int,
-                                               output_len: int, seed: int):
-    """Verify greedy equality with cuda graph enabled and different
-    batch sizes."""
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 8,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.9
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-def test_mtp_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.9
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "num_speculative_tokens": k,
-            },
-        }
-        # Try a range of num. speculative tokens
-        for k in range(1, 1 + MAX_SPEC_TOKENS)
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_mtp_different_k(vllm_runner, common_llm_kwargs,
-                         per_test_common_llm_kwargs, baseline_llm_kwargs,
-                         test_llm_kwargs, batch_size: int, output_len: int,
-                         seed: int):
-    """Verify that mtp speculative decoding produces exact equality
-    to without spec decode with different values of num_speculative_tokens.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Precision
-        "dtype": PRECISION,
-
-        # Main model
-        "model_name": MAIN_MODEL,
-
-        # GPU memory utilization
-        "gpu_memory_utilization": 0.9
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int):
-    """Verify that mtp speculative decoding produces exact equality
-    to without spec decode when speculation is disabled for large
-    batch sizes.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-
-
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-
-For ngram lookup, its idea comes from https://github.com/apoorvumang/prompt-lookup-decoding,
-and is merged into transform code base: https://github.com/huggingface/transformers/pull/27775.
-Since there is no model is needed for generate the proposal, we could make
-the testcase much simpler than drafter multi-step one.
-
-However, we still need to verify below scenario could be passed:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various ngram sizes / speculative sizes
-
-With those tests, we can say at least, ngram spec would not break the
-correctness for the target model outputs.
-"""
-
-import pytest
-
-from ..utils import maybe_enable_chunked_prefill
-from .conftest import run_equality_correctness_test
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {
-        "model_name": "JackFram/llama-68m",
-    },
-])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "method": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-            "disable_mqa_scorer": False,
-        },
-    },
-    {
-        "speculative_config": {
-            "method": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-            "disable_mqa_scorer": True,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    256,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("prefill_chunk_size", [-1, 4])
-@pytest.mark.parametrize("seed", [1])
-def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                      per_test_common_llm_kwargs,
-                                      baseline_llm_kwargs, test_llm_kwargs,
-                                      batch_size: int, output_len: int,
-                                      prefill_chunk_size: int, seed: int):
-    """Verify greedy equality on a tiny model with different batch size."""
-    maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs)
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # Print spec metrics.
-        "disable_log_stats": False,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {
-        "model_name": "JackFram/llama-68m",
-    },
-])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "method": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-            "disable_logprobs": False,
-        },
-    },
-    {
-        "speculative_config": {
-            "method": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-            "disable_logprobs": True,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    8,
-])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                   per_test_common_llm_kwargs,
-                                   baseline_llm_kwargs, test_llm_kwargs,
-                                   batch_size: int, output_len: int, seed: int,
-                                   logprobs: int):
-    """Verify greedy equality on a tiny model with different batch size."""
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        max_output_len=output_len,
-        seed=seed,
-        temperature=0.0,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 16,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    {
-        "model_name": "JackFram/llama-160m",
-    },
-])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "method": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-        },
-        "enable_chunked_prefill": False,
-    },
-    {
-        "speculative_config": {
-            "method": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-            "disable_mqa_scorer": True,
-        },
-        "enable_chunked_prefill": True,
-        "max_num_batched_tokens": 4,
-        "max_num_seqs": 4
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        256,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-def test_ngram_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  temperature=0,
-                                  seed=seed)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "method": "ngram",
-                "num_speculative_tokens": k,
-                "prompt_lookup_max": 3,
-            },
-        }
-        # Try a range of common k, as well as large speculation.
-        for k in [1, 3, 5]
-    ] + [
-        {
-            "speculative_config": {
-                "method": "ngram",
-                "num_speculative_tokens": k,
-                "prompt_lookup_max": 1,
-            },
-        }
-        # Try a range of common k, as well as large speculation.
-        for k in [1, 3, 5]
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_ngram_different_k(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int):
-    """Verify that ngram speculative decoding produces exact equality
-    to without spec decode with many different values of k and
-    different ngram prompt_lookup_max.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "method": "ngram",
-        "num_speculative_tokens": 5,
-        "prompt_lookup_max": 3,
-        "disable_by_batch_size": 4
-    },
-}, {
-    "speculative_config": {
-        "method": "ngram",
-        "num_speculative_tokens": 5,
-        "prompt_lookup_max": 3,
-        "disable_by_batch_size": 4,
-        "disable_mqa_scorer": True,
-    },
-    "enable_chunked_prefill": True,
-    "max_num_batched_tokens": 4,
-    "max_num_seqs": 4
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
-                             per_test_common_llm_kwargs, baseline_llm_kwargs,
-                             test_llm_kwargs, batch_size: int, output_len: int,
-                             seed: int):
-    """Verify that ngram speculative decoding produces exact equality
-    to without spec decode with many different values of k and
-    different ngram prompt_lookup_max.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "method": "ngram",
-        "num_speculative_tokens": 5,
-        "prompt_lookup_max": 3,
-        "disable_mqa_scorer": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_ngram_scorer(vllm_runner, common_llm_kwargs,
-                      per_test_common_llm_kwargs, baseline_llm_kwargs,
-                      test_llm_kwargs, batch_size: int, output_len: int,
-                      seed: int):
-    """Verify that ngram speculative decoding generates the same output 
-    with batch expansion scorer and mqa scorer.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
--- a/tests/spec_decode/e2e/test_seed.py
+++ b/tests/spec_decode/e2e/test_seed.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from .conftest import run_equality_correctness_test
-
-# main model
-MAIN_MODEL = "JackFram/llama-68m"
-
-# speculative model
-SPEC_MODEL = "JackFram/llama-160m"
-
-
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-
-        # speculative config
-        "speculative_config": {
-            "model": "JackFram/llama-160m",
-            "num_speculative_tokens": 3,
-        },
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
-@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
-@pytest.mark.parametrize("batch_size", [1, 8, 32])
-@pytest.mark.parametrize("temperature", [0.1, 1.0])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        20,
-    ])
-def test_seeded_consistency(vllm_runner, common_llm_kwargs,
-                            per_test_common_llm_kwargs, baseline_llm_kwargs,
-                            test_llm_kwargs, batch_size: int,
-                            temperature: float, output_len: int):
-    """Verify outputs are consistent across multiple runs with same seed
-    """
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        max_output_len=output_len,
-        temperature=temperature,
-        disable_seed=False,
-    )
-
-    # Ensure this same test does fail if we _don't_ include per-request seeds
-    with pytest.raises(AssertionError):
-        run_equality_correctness_test(
-            vllm_runner,
-            common_llm_kwargs,
-            per_test_common_llm_kwargs,
-            baseline_llm_kwargs,
-            test_llm_kwargs,
-            batch_size,
-            max_output_len=output_len,
-            temperature=temperature,
-            disable_seed=True,
-        )
--- a/tests/spec_decode/test_batch_expansion.py
+++ b/tests/spec_decode/test_batch_expansion.py
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
--- a/tests/spec_decode/test_memory_usage.py
+++ b/tests/spec_decode/test_memory_usage.py
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
--- a/tests/spec_decode/test_scorer.py
+++ b/tests/spec_decode/test_scorer.py
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
--- a/tests/spec_decode/test_utils.py
+++ b/tests/spec_decode/test_utils.py
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
@@ -29,7 +29,6 @@ def test_sampler_output_initialization(sampler_output, sample_outputs):
    assert len(sampler_output) == len(sample_outputs)
    assert sampler_output.sampled_token_probs is None
    assert sampler_output.sampled_token_ids is None
-    assert sampler_output.spec_decode_worker_metrics is None


 def test_sampler_output_getitem(sampler_output, sample_outputs):

--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -40,12 +40,6 @@ def test_unsupported_configs(monkeypatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                kv_cache_dtype="fp8",
-            ).create_engine_config()
-
        with pytest.raises(NotImplementedError):
            AsyncEngineArgs(
                model=MODEL,

--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -32,6 +32,5 @@ run_mypy vllm/lora
 run_mypy vllm/model_executor
 run_mypy vllm/plugins
 run_mypy vllm/prompt_adapter
-run_mypy vllm/spec_decode
 run_mypy vllm/worker
 run_mypy vllm/v1