[V0 Deprecation] Remove V0 Spec Decode workers (#21152)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 Deprecation] Remove V0 Spec Decode workers (#21152)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
dd572c0a · Woosuk Kwon · GitHub · 9ffe905a · dd572c0a · dd572c0a
Unverified Commit dd572c0a authored Jul 18, 2025 by Woosuk Kwon Committed by GitHub Jul 18, 2025
20 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -159,7 +159,6 @@ steps:
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
-  - tests/spec_decode/e2e/test_integration_dist_tp4
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
@@ -182,7 +181,6 @@ steps:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
@@ -330,17 +328,6 @@ steps:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: Speculative decoding tests # 40min
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/spec_decode
-  - tests/spec_decode
-  - vllm/model_executor/models/eagle.py
-  commands:
-    - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-    - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
-    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 - label: LoRA Test %N # 15min each
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
@@ -726,7 +713,6 @@ steps:
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
  # TODO: investigate and fix
-  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -43,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
-/tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -164,10 +164,7 @@ pull_request_rules:
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
-      - files~=^vllm/spec_decode/
      - files~=^vllm/v1/spec_decode/
-      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
-      - files~=^tests/spec_decode/
      - files~=^tests/v1/spec_decode/
      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
      - files~=^vllm/model_executor/models/.*eagle.*\.py

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,7 +73,6 @@ line-length = 80
 "vllm/engine/**/*.py" = ["UP006", "UP035"]
 "vllm/executor/**/*.py" = ["UP006", "UP035"]
 "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"]
-"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
 "vllm/worker/**/*.py" = ["UP006", "UP035"]
 # Python 3.8 typing - skip utils for ROCm
 "vllm/utils/__init__.py" = ["UP006", "UP035"]

--- a/tests/core/test_serialization.py
+++ b/tests/core/test_serialization.py
@@ -6,7 +6,7 @@ import msgspec
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.sequence import ExecuteModelRequest
-from ..spec_decode.utils import create_batch
+from .utils import create_batch
 def test_msgspec_serialization():

--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -4,15 +4,16 @@
 import time
 from collections import defaultdict
 from collections.abc import Sequence as GenericSequence
-from typing import Any, Optional
+from itertools import count
+from typing import Any, Optional, Union
 import torch
-from vllm import SamplingParams
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs
 from vllm.lora.request import LoRARequest
-from vllm.sequence import (Logprob, Sequence, SequenceGroup,
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup,
                           SequenceGroupMetadata)
@@ -262,3 +263,130 @@ class SchedulerProxy:
        self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]:
        _, _, ret = self.call_history["schedule"][-1]
        return ret
+def create_seq_group_metadata_from_prompts(
+    prompts: list[list[int]],
+    num_gpu_blocks: int,
+    block_size: int,
+    final_prompt_lens: list[int],
+    continuations: Optional[list[list[int]]] = None,
+    seq_ids: Optional[list[int]] = None,
+) -> list[SequenceGroupMetadata]:
+    if continuations is None:
+        continuations = [[] for _ in prompts]
+    if seq_ids is None:
+        seq_ids = list(i for i, _ in enumerate(prompts))
+    free_gpu_blocks = list(range(num_gpu_blocks))
+    block_allocations = {
+        i: [
+            free_gpu_blocks.pop()
+            for _ in range(round_up_to_next_block(final_len, block_size))
+        ]
+        for i, final_len in enumerate(final_prompt_lens)
+    }
+    seq_grou_metadata_list = []
+    for i, (prompt_token_ids,
+            cont_token_ids) in enumerate(zip(prompts, continuations)):
+        data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids)
+        data.update_num_computed_tokens(
+            len(prompt_token_ids) + len(cont_token_ids) - 1)
+        seq_data = {i: data}
+        seq_grou_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=str(i),
+                is_prompt=len(cont_token_ids) == 0,
+                seq_data=seq_data,
+                sampling_params=SamplingParams(temperature=0.0),
+                block_tables={i: block_allocations[i][:]},
+            ))
+    return seq_grou_metadata_list
+def create_chunked_seq_group_metadata_from_prompt(
+        prompt: list[int],
+        num_gpu_blocks: int,
+        chunk_size: int,
+        block_size: int,
+        seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]:
+    if seq_id is None:
+        seq_id = 0
+    free_gpu_blocks = list(range(num_gpu_blocks))
+    block_allocations = [
+        free_gpu_blocks.pop()
+        for _ in range(round_up_to_next_block(len(prompt), block_size))
+    ]
+    seq_group_metadata_list = []
+    for i, idx in enumerate(range(0, len(prompt), chunk_size)):
+        chunk_ids = prompt[idx:idx + chunk_size]
+        data = SequenceData.from_seqs(prompt)
+        data.update_num_computed_tokens(idx)
+        seq_data = {i: data}
+        seq_group_metadata_list.append(
+            SequenceGroupMetadata(
+                request_id=str(seq_id),
+                is_prompt=True,
+                do_sample=idx + chunk_size >= len(prompt),  # terminal chunk
+                seq_data=seq_data,
+                sampling_params=SamplingParams(temperature=0.0),
+                block_tables={i: block_allocations},
+                token_chunk_size=len(chunk_ids)))
+    return seq_group_metadata_list
+def create_batch(batch_size,
+                 k,
+                 prompt_len: Union[int, list[int]] = 10,
+                 prev_output_token_len: int = 10,
+                 seq_ids: Optional[list[int]] = None,
+                 num_gpu_blocks: Optional[int] = None,
+                 block_size: Optional[int] = None,
+                 prefill_chunk_size: Optional[int] = None):
+    if block_size is None:
+        block_size = 8
+    if num_gpu_blocks is None:
+        num_gpu_blocks = 2048 // block_size
+    iterator = count()
+    if isinstance(prompt_len, int):
+        prompt_lens = [prompt_len for _ in range(batch_size)]
+    else:
+        prompt_lens = prompt_len
+    prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens]
+    if prefill_chunk_size:
+        # Create a batch of chunked prompts.
+        if not seq_ids:
+            seq_ids = list(range(len(prompts)))
+        seq_group_metadata_list = []
+        for p, sid in zip(prompts, seq_ids):
+            seq_group_metadata_list += \
+                create_chunked_seq_group_metadata_from_prompt(
+                p, num_gpu_blocks, prefill_chunk_size, block_size, sid)
+        seq_group_metadata_list = seq_group_metadata_list[:batch_size]
+        prev_output_tokens = []
+    else:
+        prev_output_tokens = [[
+            next(iterator) for _ in range(prev_output_token_len)
+        ] for _ in range(batch_size)]
+        final_prompt_lens = [
+            len(prompt) + len(prev_output_token) + k + 1
+            for prompt, prev_output_token in zip(prompts, prev_output_tokens)
+        ]
+        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+            prompts, num_gpu_blocks, block_size, final_prompt_lens,
+            prev_output_tokens, seq_ids)
+    return seq_group_metadata_list, prompts, prev_output_tokens
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
 import pytest
 import ray
 from prometheus_client import REGISTRY
 import vllm.envs as envs
 from vllm import EngineArgs, LLMEngine
-from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.metrics import RayPrometheusStatLogger
@@ -232,149 +229,6 @@ def test_engine_log_metrics_regression(
    assert_metrics(model, engine, disable_log_stats, len(example_prompts))
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [10])
-def test_metric_spec_decode(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    k = 5
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            disable_log_stats=False,
-            gpu_memory_utilization=0.4,
-            speculative_config={
-                "model": model,
-                "num_speculative_tokens": k,
-            },
-    ) as vllm_model:
-        # Force log interval to be 0 to catch all metrics.
-        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
-        stat_logger.local_interval = 0
-        # Note that the purpose of this test is to verify spec decode
-        # metrics instead of functional correctness, so the expected values
-        # are intended to be loose.
-        metric_name_to_expected_fn = {
-            "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
-            "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
-            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
-            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
-            "counter_spec_decode_num_emitted_tokens":
-            lambda v: 0 <= v <= k + 1,
-        }
-        # Use one request to better inspect the metrics.
-        prompts = example_prompts[:1]
-        _ = vllm_model.generate_greedy(prompts, max_tokens)
-        for metric_name, is_expected in metric_name_to_expected_fn.items():
-            metric_val = getattr(
-                stat_logger.metrics,
-                metric_name).labels(**stat_logger.labels)._value.get()
-            assert is_expected(metric_val), (
-                f"the value of metric {metric_name} ({metric_val}) "
-                "does not meet expectation")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [10])
-@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
-def test_metric_spec_decode_interval(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    log_interval: int,
-) -> None:
-    k = 5
-    engine_args = EngineArgs(
-        model=model,
-        dtype=dtype,
-        disable_log_stats=False,
-        gpu_memory_utilization=0.4,
-        speculative_config={
-            "model": model,
-            "num_speculative_tokens": k,
-        },
-        enforce_eager=True,
-    )
-    engine = LLMEngine.from_engine_args(engine_args)
-    try:
-        engine.add_request(
-            "request-id-0",
-            example_prompts[0],
-            SamplingParams(max_tokens=max_tokens),
-        )
-        # set log internal
-        stat_logger = engine.stat_loggers['prometheus']
-        stat_logger.local_interval = log_interval
-        # prefill
-        engine.step()
-        # wait for 5 seconds to ensure that spec decode metrics
-        # get triggered in first decode step
-        time.sleep(5)
-        # first decode step should trigger async collection of metrics
-        engine.step()
-        # wait one second to allow H2D transfer to finish
-        time.sleep(1)
-        # second decode step should now be able to collect the spec
-        # decode stats and the request should also be finished
-        engine.step()
-        # must have finisehd now
-        assert not engine.has_unfinished_requests()
-        # wait to ensure logging occurs
-        time.sleep(log_interval)
-        # force logging
-        engine.step()
-        # Note that the purpose of this test is to verify spec decode
-        # metrics instead of functional correctness, so the expected values
-        # are intended to be loose.
-        metric_name_to_expected_fn = {
-            "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
-            "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
-            "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
-            "counter_spec_decode_num_draft_tokens": lambda v: v == k,
-            "counter_spec_decode_num_emitted_tokens":
-            lambda v: 0 <= v <= k + 1,
-        }
-        for metric_name, is_expected in metric_name_to_expected_fn.items():
-            metric_val = getattr(
-                stat_logger.metrics,
-                metric_name).labels(**stat_logger.labels)._value.get()
-            assert is_expected(metric_val), (
-                f"the value of metric {metric_name} ({metric_val}) "
-                "does not meet expectation")
-    finally:
-        del engine
-        cleanup_dist_env_and_memory()
 def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
                   num_requests: int) -> None:
    if disable_log_stats:

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -457,12 +457,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
 _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
-    "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m",
-                                  speculative_model="abhigoyal/vllm-eagle-llama-68m-random"),  # noqa: E501
    "MedusaModel": _HfExamplesInfo("JackFram/llama-68m",
                                   speculative_model="abhigoyal/vllm-medusa-llama-68m-random"),  # noqa: E501
-    "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
+    # Temporarily disabled.
-                                                    speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
+    # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
+    # "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
+    #                                                 speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
    "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random",
                                        speculative_model="luccafong/deepseek_mtp_draft_random",  # noqa: E501
                                        trust_remote_code=True),

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -72,11 +72,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
 @create_new_process_for_each_test()
-@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
+@pytest.mark.parametrize(
-    ("MLPSpeculatorPreTrainedModel", False, False),
+    "model_arch,is_pp,init_cuda",
+    [
+        # TODO(woosuk): Re-enable this once the MLP Speculator is supported
+        # in V1.
+        # ("MLPSpeculatorPreTrainedModel", False, False),
        ("DeepseekV2ForCausalLM", True, False),
        ("Qwen2VLForConditionalGeneration", True, True),
-])
+    ])
 def test_registry_is_pp(model_arch, is_pp, init_cuda):
    assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for rejection sampling."""
-import pytest
-import torch
-import torch.nn.functional as F
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.model_executor.utils import set_random_seed
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    This file tests V0 internals, so set VLLM_USE_V1=0.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-CUDA_DEVICES = [
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
-def mock_causal_accepted_tensor(
-        k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor:
-    """Generate an "accepted" tensor which should yield causally-accepted tokens
-    up to last accepted indices.
-    Tokens after last_accepted_indices+1 may also be accepted, although they
-    will not be causally accepted.
-    """
-    batch_size = last_accepted_indices.shape[0]
-    accepted = (torch.arange(k).expand(batch_size, k)
-                <= last_accepted_indices.unsqueeze(-1).broadcast_to(
-                    batch_size, k))
-    # Sprinkle accepted values after the contiguous initial accepted values.
-    # This replicates the behavior of rejection sampling, which may "accept"
-    # a token that cannot be accepted because of causality.
-    sprinkle_candidates = (torch.arange(k).expand(
-        batch_size,
-        k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
-                           1)
-    sprinkle = torch.rand(batch_size, k) > 0.5
-    accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
-    return accepted
-@pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize(
-    "which_tokens_accepted",
-    ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False])
-@torch.inference_mode()
-def test_correct_output_format(which_tokens_accepted: str, seed: int,
-                               device: str, use_flashinfer: bool):
-    """Verify the output has correct format given predetermined accepted matrix.
-    """
-    set_random_seed(seed)
-    torch.set_default_device(device)
-    batch_size = 10
-    k = 5
-    vocab_size = 3000
-    if which_tokens_accepted == "all_tokens_accepted":
-        accepted = mock_causal_accepted_tensor(
-            k, -1 + k * torch.ones((batch_size, ), dtype=torch.long))
-    elif which_tokens_accepted == "no_tokens_accepted":
-        accepted = mock_causal_accepted_tensor(
-            k, -torch.ones((batch_size, ), dtype=torch.long))
-    elif which_tokens_accepted == "some_tokens_accepted":
-        last_accepted_indices = torch.randint(low=-1,
-                                              high=k,
-                                              size=(batch_size, ))
-        accepted = mock_causal_accepted_tensor(k, last_accepted_indices)
-    else:
-        raise AssertionError()
-    recovered_token_ids = torch.randint(low=0,
-                                        high=vocab_size,
-                                        size=(batch_size, k),
-                                        dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
-    rejection_sampler.init_gpu_tensors(device=device)
-    output_token_ids = rejection_sampler._create_output(  # pylint: disable=protected-access
-        accepted,
-        recovered_token_ids,
-        draft_token_ids,
-        bonus_token_ids,
-    )
-    expected_bonus_token_ids = bonus_token_ids.clone()
-    if which_tokens_accepted == "all_tokens_accepted":
-        # Expect all tokens to be equal to draft tokens.
-        assert torch.equal(output_token_ids[:, :-1], draft_token_ids)
-        # Expect all bonus tokens to be included.
-        assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids)
-    elif which_tokens_accepted == "no_tokens_accepted":
-        # Expect first token to be equal to recovered tokens.
-        assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0])
-        # Expect everything else to be -1.
-        assert torch.equal(output_token_ids[:, 1:],
-                           torch.ones_like(output_token_ids[:, 1:]) * -1)
-    elif which_tokens_accepted == "some_tokens_accepted":
-        recovered_plus_bonus = torch.cat(
-            (recovered_token_ids, expected_bonus_token_ids), dim=-1)
-        # Assert first rejected token is a recovered token or bonus token.
-        assert torch.equal(
-            recovered_plus_bonus[torch.arange(0, batch_size),
-                                 last_accepted_indices + 1],
-            output_token_ids[torch.arange(0, batch_size),
-                             last_accepted_indices + 1])
-        # Assert every subsequent token is -1.
-        subsequent_mask = torch.arange(0, k + 1).expand(
-            batch_size, k + 1) >= (last_accepted_indices + 2).unsqueeze(-1)
-        assert torch.all(output_token_ids[subsequent_mask] == -1)
-@pytest.mark.parametrize("k", list(range(1, 6)))
-@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
-@pytest.mark.parametrize("batch_size", list(range(1, 32)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False])
-@torch.inference_mode()
-def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
-                                    device: str, use_flashinfer: bool):
-    torch.set_default_device(device)
-    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
-    rejection_sampler.init_gpu_tensors(device=device)
-    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    target_probs = torch.rand(batch_size,
-                              k + 1,
-                              vocab_size,
-                              dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                      draft_token_ids)
-@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
-@pytest.mark.parametrize("k", [1, 3, 6])
-@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
-@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
-@pytest.mark.parametrize("n_rep", [100])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-# Not testing FlashInfer now, since 0.2.3 API removed the ability
-# to pass in uniform samples.
-@pytest.mark.parametrize("use_flashinfer", [False])
-@torch.inference_mode()
-def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
-                                   frac_seeded: float, n_rep: int, device: str,
-                                   use_flashinfer: bool):
-    torch.set_default_device(device)
-    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
-    rejection_sampler.init_gpu_tensors(device=device)
-    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    target_probs = torch.rand(batch_size,
-                              k + 1,
-                              vocab_size,
-                              dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
-    results = []
-    for _ in range(n_rep):
-        seeded_seqs = {
-            i: torch.Generator(device=device).manual_seed(i)
-            for i in range(batch_size) if seeded_mask[i]
-        }
-        results.append(
-            rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                              draft_token_ids, seeded_seqs))
-    for i in range(batch_size):
-        if seeded_mask[i]:
-            for j in range(1, n_rep):
-                assert torch.equal(results[j][i], results[0][i])
-@pytest.mark.parametrize("k", [1, 3, 6])
-@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
-@pytest.mark.parametrize("batch_size", [3, 8, 32, 128])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-# @pytest.mark.parametrize("use_flashinfer", [True, False])
-# Not testing FlashInfer now, since 0.2.3 API removed the ability
-# to pass in uniform samples.
-@pytest.mark.parametrize("use_flashinfer", [False])
-@torch.inference_mode()
-def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int,
-                            device: str, use_flashinfer: bool):
-    torch.set_default_device(device)
-    set_random_seed(0)
-    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    target_probs = torch.rand(batch_size,
-                              k + 1,
-                              vocab_size,
-                              dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    single_batches = []
-    for i in range(batch_size):
-        single_batches.append((draft_probs[i].clone().unsqueeze(0),
-                               draft_token_ids[i].clone().unsqueeze(0),
-                               target_probs[i].clone().unsqueeze(0),
-                               bonus_token_ids[i].clone().unsqueeze(0),
-                               draft_token_ids[i].clone().unsqueeze(0)))
-    set_random_seed(0)
-    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
-    rejection_sampler.init_gpu_tensors(device=device)
-    results = []
-    seeded_seqs = {
-        i: torch.Generator(device=device).manual_seed(i)
-        for i in range(1, batch_size)  # 0 is seed None
-    }
-    batch_result = rejection_sampler(target_probs.clone(),
-                                     bonus_token_ids.clone(),
-                                     draft_probs.clone(),
-                                     draft_token_ids.clone(), seeded_seqs)
-    set_random_seed(0)
-    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
-    rejection_sampler.init_gpu_tensors(device=device)
-    for i in range(batch_size):
-        request_seeded_seqs = {
-            0: torch.Generator(device=device).manual_seed(i)
-        } if seeded_seqs.get(i) is not None else None
-        (draft_probs, draft_token_ids, target_probs, bonus_token_ids,
-         draft_token_ids) = single_batches[i]
-        results.append(
-            rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                              draft_token_ids, request_seeded_seqs))
-    for i in range(batch_size):
-        assert torch.equal(batch_result[i], results[i].squeeze(0))
-@pytest.mark.parametrize("k", [1, 3, 6])
-@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
-@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
-                                       batch_size: int, device: str):
-    """
-    Test the flashinfer and nonflashinfer backend generate 
-    the same output metrics.
-    """
-    pytest.skip("Not testing FlashInfer now, since 0.2.3 API removed "
-                "the ability to pass in uniform samples.")
-    torch.set_default_device(device)
-    torch.manual_seed(0)
-    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    target_probs = torch.rand(batch_size,
-                              k + 1,
-                              vocab_size,
-                              dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    num_accepted_tokens = []
-    num_emitted_tokens = []
-    num_draft_tokens = []
-    def get_seeded_seqs():
-        return {
-            i: torch.Generator(device=device).manual_seed(i)
-            for i in range(batch_size)
-        }
-    for use_flashinfer in [True, False]:
-        rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
-        rejection_sampler.init_gpu_tensors(device=device)
-        # We use seeded sequences to ensure the same tokens are accepted
-        # for both flashinfer and nonflashinfer backends.
-        seeded_seqs = get_seeded_seqs()
-        rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                          draft_token_ids, seeded_seqs)
-        num_accepted_tokens.append(rejection_sampler.num_accepted_tokens)
-        num_emitted_tokens.append(rejection_sampler.num_emitted_tokens)
-        num_draft_tokens.append(rejection_sampler.num_draft_tokens)
-    assert num_accepted_tokens[0] == num_accepted_tokens[1]
-    assert num_emitted_tokens[0] == num_emitted_tokens[1]
-    assert num_draft_tokens[0] == num_draft_tokens[1]
-@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
-@pytest.mark.parametrize("which_token_ids",
-                         ["bonus_token_ids", "draft_token_ids"])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False])
-@torch.inference_mode()
-def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
-                               which_token_ids: str, device: str,
-                               use_flashinfer: bool):
-    k = 3
-    batch_size = 5
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer,
-                                         strict_mode=True)
-    rejection_sampler.init_gpu_tensors(device=device)
-    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    target_probs = torch.rand(batch_size,
-                              k + 1,
-                              vocab_size,
-                              dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    oob_token_ids = None
-    if which_token_ids == "bonus_token_ids":
-        oob_token_ids = bonus_token_ids
-    elif which_token_ids == "draft_token_ids":
-        oob_token_ids = draft_token_ids
-    else:
-        raise AssertionError()
-    if above_or_below_vocab_range == "above":
-        rogue_token_id = vocab_size + 1
-    elif above_or_below_vocab_range == "below":
-        rogue_token_id = -1
-    else:
-        raise AssertionError()
-    oob_token_ids[0][0] = rogue_token_id
-    with pytest.raises(AssertionError):
-        rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                          draft_token_ids)
-@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
-@pytest.mark.parametrize("seed", list(range(5)))
-@pytest.mark.parametrize("use_flashinfer", [True, False])
-@torch.inference_mode()
-def test_rejection_sampling_approximates_target_distribution(
-        seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):
-    """Verify rejection sampling approximates target distribution,
-    despite sampling from a potentially distinct draft distribution.
-    This is done by first creating a random target probability
-    distribution and a random draft probability distribution. We then
-    sample token ids from the rejection sampler using these draft
-    and target distributions. The samples are used to estimate
-    the output probability distribution, which we expect to approximate
-    the target distribution.
-    A basic distance metric is used to determine similarity between
-    distributions.
-    We expect that as we increase the number of samples,
-    the distance between the observed distribution and the target
-    distribution decreases. To measure this, we compare the distance
-    of the observed distribution against both the target distribution
-    and a uniform random distribution. We expect the distance between
-    the observed distribution and the target distribution to improve
-    much more than the distance improvement between the observed
-    distribution and the random distribution.
-    When draft_and_target_probs_equal=True, the draft and target
-    probabilities are exactly equal. Rejection sampling should
-    still work without any NaNs or exceptions.
-    """
-    torch.set_default_device("cpu")
-    set_random_seed(seed)
-    helper = _CorrectnessTestHelper(
-        vocab_size=10,
-        rejection_sampler=RejectionSampler(use_flashinfer=use_flashinfer),
-    )
-    draft_probs, target_probs, reference_probs = helper.generate_probs_for_test(
-        draft_and_target_probs_equal)
-    sample_sizes = [10, 100, 1_000, 10_000, 100_000]
-    distance_wrt_reference: list[float] = []
-    distance_wrt_target: list[float] = []
-    for num_samples in sample_sizes:
-        (reference_vs_rejsample_dist,
-         target_vs_rejsample_dist) = helper.run_and_compare_distributions(
-             draft_probs,
-             target_probs,
-             reference_probs,
-             num_samples,
-         )
-        distance_wrt_reference.append(reference_vs_rejsample_dist)
-        distance_wrt_target.append(target_vs_rejsample_dist)
-        relative_change_in_distance_wrt_target = get_ratio_first_to_last(
-            distance_wrt_target)
-        relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
-            distance_wrt_reference)
-        print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
-              f"{reference_vs_rejsample_dist=:.05f}")
-        print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
-              f"{relative_change_in_distance_wrt_reference=:.02f}")
-    relative_change_in_distance_wrt_target = get_ratio_first_to_last(
-        distance_wrt_target)
-    relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
-        distance_wrt_reference)
-    expected_improvement_multiplier = 20
-    assert (relative_change_in_distance_wrt_target
-            > relative_change_in_distance_wrt_reference *
-            expected_improvement_multiplier)
-def get_ratio_first_to_last(elements: list[float]) -> float:
-    return elements[0] / elements[-1]
-class _CorrectnessTestHelper:
-    """Class that packages together logic required for the unit-level
-    rejection sampling correctness test.
-    """
-    def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler):
-        self.rejection_sampler = rejection_sampler
-        self.vocab_size = vocab_size
-        self.vocab_range = (0, vocab_size)
-        self.rejection_sampler.init_gpu_tensors(device=0)
-        # Keep test simple, use k=1
-        self.k = 1
-        # Bonus tokens not used, but rejection sampler requires
-        # correct shape.
-        self.num_bonus_tokens = 1
-    def generate_probs_for_test(
-        self, draft_and_target_probs_equal: bool
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        draft_probs, target_probs = (F.softmax(
-            torch.rand(self.vocab_size, dtype=torch.float32),
-            dim=-1,
-        ) for _ in range(2))
-        num_reference_probs = 100
-        reference_probs = F.softmax(
-            torch.rand(num_reference_probs,
-                       self.vocab_size,
-                       dtype=torch.float32),
-            dim=-1,
-        )
-        if draft_and_target_probs_equal:
-            target_probs = draft_probs.clone()
-        return draft_probs, target_probs, reference_probs
-    def run_and_compare_distributions(self, draft_probs: torch.Tensor,
-                                      target_probs: torch.Tensor,
-                                      reference_probs: torch.Tensor,
-                                      num_samples: int) -> tuple[float, float]:
-        # Sample using rejection sampling.
-        rej_sample_probs = self._estimate_rejection_sampling_pdf(
-            draft_probs, target_probs, num_samples)
-        # Average distance from reference probs.
-        reference_vs_rejsample_dist = torch.dist(
-            reference_probs,
-            rej_sample_probs).item() / reference_probs.shape[0]
-        target_vs_rejsample_dist = torch.dist(target_probs,
-                                              rej_sample_probs).item()
-        return reference_vs_rejsample_dist, target_vs_rejsample_dist
-    def _estimate_rejection_sampling_pdf(
-        self,
-        draft_probs: torch.Tensor,
-        target_probs: torch.Tensor,
-        num_samples: int,
-    ) -> torch.Tensor:
-        # Repeat draft probs num_samples times.
-        draft_probs = draft_probs.reshape(1, self.k, self.vocab_size).repeat(
-            num_samples, 1, 1)
-        # Repeat target probs num_samples * (k + 1) times.
-        # Rejection sampler requires bonus token probs, but they aren't used.
-        target_probs = target_probs.reshape(1, 1, self.vocab_size).repeat(
-            num_samples, self.k + 1, 1)
-        # Randomly sample draft token ids from draft probs.
-        draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
-                                            num_samples=1,
-                                            replacement=True).reshape(
-                                                num_samples, self.k)
-        # Bonus tokens not used but required.
-        bonus_token_ids = torch.zeros((1, self.num_bonus_tokens),
-                                      dtype=torch.int64,
-                                      device="cuda").repeat(num_samples, 1)
-        # Get output tokens via rejection sampling.
-        output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
-                                                  bonus_token_ids.to("cuda"),
-                                                  draft_probs.to("cuda"),
-                                                  draft_token_ids.to("cuda"))
-        # Remove bonus tokens
-        output_token_ids = output_token_ids[:, :-1].flatten()
-        # Estimate probability density function
-        hist = torch.histogram(output_token_ids.to(dtype=torch.float,
-                                                   device="cpu"),
-                               bins=self.vocab_size,
-                               range=self.vocab_range,
-                               density=True)
-        return hist.hist
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for rejection sampling."""
-import pytest
-import torch
-from vllm.model_executor.layers.typical_acceptance_sampler import (
-    TypicalAcceptanceSampler)
-from vllm.model_executor.utils import set_random_seed
-CUDA_DEVICES = [f"cuda:{i}" for i in range(1)]
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    This file tests V0 internals, so set VLLM_USE_V1=0.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-def get_zero_temperature_prob_dist(batch_size, k, vocab_size):
-    """
-    Generates a fake temperature zero probability distribution.
-    Returns:
-        1. A fake temperature zero probability distribution of shape
-           [batch_size, k, vocab_size]
-        2. Tensor of shape [batch_size, k] containing the token ids 
-           of the probability 1.0 tokens at each position.
-    """
-    # Simulate temperature 0 probability distribution for target probabilities
-    # and create target probabilities such that only 1 token id has
-    # probability 1.0
-    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    probs = torch.rand(batch_size, k, vocab_size)
-    _, zero_temperature_token_ids = torch.max(probs, dim=-1)
-    # set the probability of the tokens with ids in zero_temperature_token_ids
-    # to 1 and the rest to 0.
-    target_probs = torch.zeros_like(probs).scatter_(
-        -1, zero_temperature_token_ids.unsqueeze(-1), 1.0)
-    return target_probs, zero_temperature_token_ids
-def get_draft_token_ids(batch_size: int, k: int, vocab_size: int,
-                        token_ids_to_exclude: torch.Tensor):
-    """
-    Returns a tensor of shape [batch_size, k] of fake draft token ids
-    drawn randomly from a vocab of size vocab_size. We however ensure
-    that token_ids from token_ids_to_exclude are excluded at the 
-    corresponding positions.
-    """
-    draft_token_ids = torch.empty(batch_size, k, dtype=torch.long)
-    for i in range(batch_size):
-        for j in range(k):
-            # Generate a random token ID excluding token_ids_to_exclude[i, j]
-            while True:
-                token_id = torch.randint(0, vocab_size, (1, )).item()
-                if token_id != token_ids_to_exclude[i, j]:
-                    draft_token_ids[i, j] = token_id
-                    break
-    return draft_token_ids
-def get_acceptance_sampler(
-    posterior_threshold: float = 0.03,
-    posterior_alpha: float = 0.9,
-    strict_mode: bool = False,
-) -> TypicalAcceptanceSampler:
-    """
-    Initializes and returns a TypicalAcceptanceSampler.
-    """
-    return TypicalAcceptanceSampler(posterior_threshold, posterior_alpha,
-                                    strict_mode)
-@pytest.mark.parametrize("k", list(range(1, 6)))
-@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
-@pytest.mark.parametrize("batch_size", list(range(1, 32)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
-                                    device: str):
-    """
-    Tests that the TypicalAcceptancSampler forward succeeds for
-    different combinations of k, vocab_size, batch_size and num devices.
-    """
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler()
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    target_with_bonus_probs = torch.rand(batch_size,
-                                         k + 1,
-                                         vocab_size,
-                                         dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    # Verify that sampling succeeds for all cases.
-    typical_acceptance_sampler(target_with_bonus_probs,
-                               bonus_token_ids,
-                               draft_probs=None,
-                               draft_token_ids=draft_token_ids)
-@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
-@pytest.mark.parametrize("which_token_ids",
-                         ["bonus_token_ids", "draft_token_ids"])
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
-                               which_token_ids: str, device: str):
-    """
-    Tests that we throw an exception of the token ids fall outside
-    the bound of the provided vocabulary.
-    """
-    k = 3
-    batch_size = 5
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    target_with_bonus_probs = torch.rand(batch_size,
-                                         k + 1,
-                                         vocab_size,
-                                         dtype=torch.float32)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    # Verify that appropriate exceptions are thrown for out
-    # of bound vocabs.
-    oob_token_ids = None
-    if which_token_ids == "bonus_token_ids":
-        oob_token_ids = bonus_token_ids
-    elif which_token_ids == "draft_token_ids":
-        oob_token_ids = draft_token_ids
-    else:
-        raise AssertionError()
-    if above_or_below_vocab_range == "above":
-        rogue_token_id = vocab_size + 1
-    elif above_or_below_vocab_range == "below":
-        rogue_token_id = -1
-    else:
-        raise AssertionError()
-    oob_token_ids[0][0] = rogue_token_id
-    with pytest.raises(AssertionError):
-        typical_acceptance_sampler(target_with_bonus_probs,
-                                   bonus_token_ids,
-                                   draft_probs=None,
-                                   draft_token_ids=draft_token_ids)
-@pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_uniform_target_distribution_accepts_all_tokens(
-        seed: int, device: str):
-    """
-     Test the TypicalAcceptanceSampler with a uniform target probability 
-     distribution.
-    This test verifies that when provided with a uniform target probability
-    distribution, the TypicalAcceptanceSampler accepts all draft tokens. The
-    entropy of the uniform target distribution being high should lead to all
-    draft tokens being accepted.
-    """
-    set_random_seed(seed)
-    k = 3
-    batch_size = 5
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    target_with_bonus_probs = torch.rand(batch_size,
-                                         k + 1,
-                                         vocab_size,
-                                         dtype=torch.float32)
-    draft_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, k),
-                                    dtype=torch.int64)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(
-        target_with_bonus_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    # We are using a uniform target probability distribution.
-    # For a uniform distribution the entropy is very high and it
-    # should lead to all draft tokens being accepted. Verify that.
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze())
-    assert torch.all(output_token_ids[:, :k] == draft_token_ids)
-@pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_temperature_zero_target_distribution(seed: int, device: str):
-    """
-    Test the TypicalAcceptanceSampler with a zero-temperature target
-    probability distribution.
-    This test verifies that when using a zero-temperature target probability
-    distribution, where only one token has a probability of 1.0, the
-    TypicalAcceptanceSampler correctly rejects all draft tokens that do not
-    match this probability. Additionally, it ensures that when all draft
-    tokens are rejected, the sampler falls back to greedy sampling to select a
-    single token from the target distribution.
-    """
-    set_random_seed(seed)
-    k = 3
-    batch_size = 5
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    # Simulate temperature 0 probability distribution for target probabilities
-    # and create target probabilities such that only 1 token id has
-    # probability 1.0
-    target_with_bonus_probs, zero_temperature_token_ids = \
-        get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size)
-    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
-    # Populate draft_token_ids such that they exclude the token_ids
-    # with probability = 1.0
-    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
-                                          zero_temperature_token_ids)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    # The target probaility distribution is a temperature zero distribution
-    # with zero entropy. Since our draft token ids don't match the probability
-    # 1.0 tokens in the target distribution we will reject all of them and
-    # fallback to the greedy sampling for selecting 1 token for each sequence.
-    # Verify the same.
-    output_token_ids = typical_acceptance_sampler(
-        target_with_bonus_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    assert torch.all(output_token_ids[:, -1] == -1)
-    assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:,
-                                                                          0])
-@pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_mixed_target_distribution(seed: int, device: str):
-    """
-    Test the TypicalAcceptanceSampler with a mixed target probability
-    distribution.
-    This test ensures that the TypicalAcceptanceSampler handles a mixed
-    target probability distribution correctly. Specifically, it uses a 
-    zero-temperature distribution for some sequences and a uniform
-    distribution for others. The test verifies that:
-    - For sequences with a zero-temperature distribution, only the token
-    with a probability of 1.0 is accepted, and all other tokens are rejected.
-    - For sequences with a uniform distribution, all draft tokens are
-    accepted.
-    """
-    set_random_seed(seed)
-    k = 3
-    batch_size = 4
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    # For sequences 0 and 2 set the distribution to a temperature
-    # zero distribution. For sequences 1 and 3 set it to a uniform
-    # distribution.
-    target_with_bonus_probs, zero_temperature_token_ids = \
-        get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size)
-    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
-    target_probs = target_with_bonus_probs[:, :-1]
-    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
-                                          zero_temperature_token_ids)
-    uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32)
-    target_probs[[1, 3]] = uniform_probs
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(
-        target_with_bonus_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    # verify the shape of output_token_ids
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    # For sequences 0 and 2 verify that only 1 token is accepted
-    # which is the token with probability 1.0 in the target distribution
-    # at position 0.
-    assert torch.all(output_token_ids[[0, 2], 1:] == -1)
-    assert (torch.all(output_token_ids[[0, 2],
-                                       0] == zero_temperature_token_ids[[0, 2],
-                                                                        0]))
-    # For sequences 1 and 3 verify that all tokens are accepted since the
-    # target probability distribution is uniform. In addition verify that
-    # we also accept the bonus tokens.
-    assert torch.all(
-        output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :])
-    assert torch.all(output_token_ids[[1, 3], -1] != -1)
-@pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_accept_tokens_partially(seed: int, device: str):
-    """
-    Test the TypicalAcceptanceSampler's behavior when only a subset of draft
-    tokens should be accepted.
-    This test verifies that the TypicalAcceptanceSampler correctly accepts or
-    rejects draft tokens based on a zero-temperature target probability
-    distribution. Specifically, it ensures that:
-    - When all draft tokens match tokens with a probability of 1.0 in the
-    target distribution, all draft tokens are accepted.
-    - When only some draft tokens match tokens with a probability of 1.0 in
-    the target distribution, only those matching tokens are accepted, and the
-    rest are rejected.
-    """
-    set_random_seed(seed)
-    k = 5
-    batch_size = 1
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    # Create a temperature zero target probability distribution and ensure
-    # all draft token ids correspond to the tokens with 1.0 probability.
-    # Verify that all of them are accepted.
-    target_with_bonus_probs, zero_temperature_token_ids = \
-        get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size)
-    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
-    draft_token_ids = zero_temperature_token_ids
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(
-        target_with_bonus_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
-    assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
-    # Next only keep the first 2 draft tokens same as the zero temperature
-    # tokens. For the remaining 3 choose some other tokens. In the
-    # response we will expect the first 2 tokens to be the same as the
-    # draft tokens and the recovered token and rest as -1
-    draft_token_ids_to_replace = get_draft_token_ids(
-        batch_size, k, vocab_size, zero_temperature_token_ids)
-    draft_token_ids = torch.cat(
-        (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1)
-    output_token_ids = typical_acceptance_sampler(
-        target_with_bonus_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2])
-    assert torch.all(
-        output_token_ids[:, 2] == target_with_bonus_probs.argmax(-1)[:, 2])
-    assert torch.all(output_token_ids[:, -3:] == -1)
-@pytest.mark.parametrize("seed", list(range(1)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_accept_tokens_set_non_default_posteriors(seed: int, device: str):
-    """
-    Test the TypicalAcceptanceSampler with custom posterior thresholds and 
-    alpha values. This test verifies that by modifying the posterior
-    thresholds and alpha values we can change the acceptance behavior of the
-    sampler. 
-    """
-    set_random_seed(seed)
-    k = 5
-    batch_size = 1
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    # Simulate temperature 0 probability distribution for target
-    # probabilities and create target probabilities such that only 1 token
-    # id has probability 1.0 and others have a very low probability of
-    # 0.00001. Populate draft_token_ids such that they exclude the token_ids
-    # with probability = 1.0. Without any changes to the posterior thresholds
-    # none of the draft tokens are accepted.
-    target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist(
-        batch_size, k + 1, vocab_size)
-    zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
-    target_probs[target_probs == 0] = 0.00001
-    draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
-                                          zero_temperature_token_ids)
-    bonus_token_ids = torch.randint(low=0,
-                                    high=vocab_size,
-                                    size=(batch_size, 1),
-                                    dtype=torch.int64)
-    output_token_ids = typical_acceptance_sampler(
-        target_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    assert torch.all(output_token_ids[:, 1:-1] == -1)
-    # Change the posterior threshold values to 0.0 so that we will
-    # now accept even draft tokens with very low probability in the
-    # target distribution. Simulate and verify the same.
-    typical_acceptance_sampler = TypicalAcceptanceSampler(
-        strict_mode=True, posterior_threshold=0.0, posterior_alpha=0.0)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    output_token_ids = typical_acceptance_sampler(
-        target_probs,
-        bonus_token_ids,
-        draft_probs=None,
-        draft_token_ids=draft_token_ids)
-    assert output_token_ids.shape[0] == batch_size
-    assert output_token_ids.shape[1] == (k + 1)
-    assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids)
-    assert torch.all(output_token_ids[:, -1] == bonus_token_ids)
-@pytest.mark.parametrize("seed", list(range(10)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_get_recovered_token_ids(seed: int, device: str):
-    """
-    Test the TypicalAcceptanceSampler's method for generating
-    replacement token IDs.
-    This test verifies that the `_get_recovered_token_ids` method of the 
-    TypicalAcceptanceSampler correctly identifies the token IDs to be used
-    as recovered token IDs based on the target probability distribution.
-    Specifically, it ensures that the method correctly identifies the
-    tokens with the highest probability for each sequence in the batch.
-    """
-    set_random_seed(seed)
-    k = 10
-    batch_size = 5
-    vocab_size = 30_000
-    torch.set_default_device(device)
-    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(device=device)
-    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
-    expected_replacement_tokens = torch.argmax(target_probs, dim=-1)
-    actual_replacement_tokens = (
-        typical_acceptance_sampler._get_recovered_token_ids(target_probs))
-    assert torch.all(expected_replacement_tokens == actual_replacement_tokens)
--- a/tests/spec_decode/__init__.py
+++ b/tests/spec_decode/__init__.py
--- a/tests/spec_decode/conftest.py
+++ b/tests/spec_decode/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/spec_decode/e2e/__init__.py
+++ b/tests/spec_decode/e2e/__init__.py
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-from itertools import cycle
-from typing import Optional, Union
-import pytest
-import torch
-from vllm import LLM, SamplingParams
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import PromptLogprobs, SampleLogprobs
-from ...models.utils import (TokensTextLogprobs,
-                             TokensTextLogprobsPromptLogprobs,
-                             check_logprobs_close, check_outputs_equal)
-from ...utils import RemoteOpenAIServer
-PROMPTS = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-    "San Francisco is know for its",
-    "Facebook was created in 2004 by",
-    "Curious George is a",
-    "Python 3.11 brings improvements to its",
-]
-@pytest.fixture
-def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
-                       test_llm_kwargs, seed):
-    def generate():
-        kwargs = {
-            **common_llm_kwargs,
-            **per_test_common_llm_kwargs,
-            **test_llm_kwargs,
-        }
-        llm = LLM(**kwargs)
-        if seed is not None:
-            set_random_seed(seed)
-        yield llm
-        del llm
-        cleanup_dist_env_and_memory()
-    return generate
-def maybe_assert_ngram_worker(llm):
-    # Verify the proposer worker is ngram if ngram is specified.
-    if (llm.llm_engine.speculative_config is not None
-            and llm.llm_engine.speculative_config.method == "ngram"):
-        from vllm.spec_decode.ngram_worker import NGramWorker
-        assert isinstance(
-            llm.llm_engine.model_executor.driver_worker.proposer_worker,
-            NGramWorker)
-def get_output_from_llm_generator(
-        llm_generator, prompts,
-        sampling_params) -> tuple[list[str], list[list[int]], float]:
-    tokens: list[str] = []
-    token_ids: list[list[int]] = []
-    acceptance_rate: float = -1.0
-    for llm in llm_generator():
-        maybe_assert_ngram_worker(llm)
-        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
-        token_ids = [output.outputs[0].token_ids for output in outputs]
-        tokens = [output.outputs[0].text for output in outputs]
-        # Fetch acceptance rate if logging is enabled.
-        if stat_loggers := getattr(llm.llm_engine, "stat_loggers", None):
-            stat_logger = stat_loggers["prometheus"]
-            acceptance_rate = (stat_logger.metrics.
-                               gauge_spec_decode_draft_acceptance_rate.labels(
-                                   **stat_logger.labels)._value.get())
-        del llm
-    return tokens, token_ids, acceptance_rate
-def check_logprobs_correctness(
-    spec_outputs: Sequence[Union[TokensTextLogprobs,
-                                 TokensTextLogprobsPromptLogprobs]],
-    baseline_outputs: Sequence[Union[TokensTextLogprobs,
-                                     TokensTextLogprobsPromptLogprobs]],
-    disable_logprobs: bool = False,
-):
-    """Compare sampled and prompt logprobs between baseline and spec decoding
-    """
-    if not disable_logprobs:
-        return check_logprobs_close(
-            outputs_0_lst=baseline_outputs,
-            outputs_1_lst=spec_outputs,
-            name_0="org",
-            name_1="sd",
-        )
-    # Check correctness when disable_logprobs == True
-    for spec_output, baseline_output in zip(spec_outputs, baseline_outputs):
-        # Check generated token logprobs.
-        spec_logprobs = spec_output[2]
-        baseline_logprobs = baseline_output[2]
-        _check_logprobs_when_output_disabled(spec_logprobs,
-                                             baseline_logprobs,
-                                             is_prompt_logprobs=False)
-        # Check prompt logprobs too, if they exist
-        if len(baseline_output) == 4:
-            assert len(spec_output) == 4
-            spec_prompt_logprobs = spec_output[3]
-            baseline_prompt_logprobs = baseline_output[3]
-            _check_logprobs_when_output_disabled(spec_prompt_logprobs,
-                                                 baseline_prompt_logprobs,
-                                                 is_prompt_logprobs=True)
-def _check_logprobs_when_output_disabled(
-    spec_logprobs: Union[Optional[PromptLogprobs], SampleLogprobs],
-    baseline_logprobs: Union[Optional[PromptLogprobs], SampleLogprobs],
-    is_prompt_logprobs: bool = False,
-):
-    # Prompt logprobs are optional
-    if is_prompt_logprobs and baseline_logprobs is None:
-        assert spec_logprobs is None
-        return
-    assert spec_logprobs is not None
-    assert baseline_logprobs is not None
-    assert len(spec_logprobs) == len(baseline_logprobs)
-    # For each generated position of the sequence.
-    for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
-            zip(spec_logprobs, baseline_logprobs)):
-        # First prompt logprob is expected to be None
-        if is_prompt_logprobs and baseline_pos_logprobs is None:
-            assert spec_pos_logprobs is None
-            assert pos == 0
-            continue
-        assert spec_pos_logprobs is not None
-        assert baseline_pos_logprobs is not None
-        # When disabled, the 1 logprob is returned with dummy values for the
-        # score and rank, but the token id should match the baseline model
-        assert len(spec_pos_logprobs) == 1
-        (spec_pos_logprob_token_id,
-         spec_pos_logprob) = next(iter(spec_pos_logprobs.items()))
-        assert spec_pos_logprob.rank == -1
-        assert spec_pos_logprob.logprob == 0.0
-        if isinstance(spec_pos_logprob_token_id, torch.Tensor):
-            spec_pos_logprob_token_id = spec_pos_logprob_token_id.item()
-        assert spec_pos_logprob_token_id in baseline_pos_logprobs
-def run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size: int,
-        max_output_len: int,
-        seed: Optional[int] = 0,
-        temperature: float = 0.0,
-        disable_seed: bool = False,
-        ignore_eos: bool = True,
-        ensure_all_accepted: bool = False,
-        expected_acceptance_rate: Optional[float] = None,
-        logprobs: Optional[int] = None,
-        prompt_logprobs: Optional[int] = None,
-        disable_logprobs: bool = False):
-    org_args = {
-        **common_llm_kwargs,
-        **per_test_common_llm_kwargs,
-        **baseline_llm_kwargs,
-    }
-    sd_args = {
-        **common_llm_kwargs,
-        **per_test_common_llm_kwargs,
-        **test_llm_kwargs,
-    }
-    prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
-    if disable_seed:
-        seed = None
-    sampling_params = SamplingParams(temperature=temperature,
-                                     max_tokens=max_output_len,
-                                     seed=seed,
-                                     ignore_eos=ignore_eos,
-                                     logprobs=logprobs,
-                                     prompt_logprobs=prompt_logprobs)
-    with vllm_runner(**org_args) as vllm_model:
-        org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
-    with vllm_runner(**sd_args) as vllm_model:
-        if ensure_all_accepted or expected_acceptance_rate is not None:
-            # Force log interval to be 0 to catch all metrics.
-            stat_logger = vllm_model.model.llm_engine.stat_loggers[
-                'prometheus']
-            stat_logger.local_interval = -100
-        sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params)
-        if ensure_all_accepted or expected_acceptance_rate is not None:
-            acceptance_rate = (stat_logger.metrics.
-                               gauge_spec_decode_draft_acceptance_rate.labels(
-                                   **stat_logger.labels)._value.get())
-            if ensure_all_accepted:
-                assert True
-                # FIXME: ci fails to log acceptance rate.
-                # It works locally.
-                # assert acceptance_rate == 1.0
-            if expected_acceptance_rate is not None:
-                assert acceptance_rate >= expected_acceptance_rate - 1e-2
-    # Only pass token entries, not the logprobs
-    check_outputs_equal(outputs_0_lst=[out[0:2] for out in org_outputs],
-                        outputs_1_lst=[out[0:2] for out in sd_outputs],
-                        name_0="org",
-                        name_1="sd")
-    # Check logprobs if requested
-    if logprobs is not None or prompt_logprobs is not None:
-        check_logprobs_correctness(sd_outputs, org_outputs, disable_logprobs)
-def run_equality_correctness_test_tp(model,
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size: int,
-                                     max_output_len: int,
-                                     seed: int = 0,
-                                     temperature: float = 0.0,
-                                     logprobs: Optional[int] = None):
-    """Helper method that compares the outputs of both the baseline LLM and
-    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
-    the same when temperature is zero.
-    """
-    arg1 = common_llm_kwargs + per_test_common_llm_kwargs + baseline_llm_kwargs
-    arg2 = common_llm_kwargs + per_test_common_llm_kwargs + test_llm_kwargs
-    env1 = env2 = None
-    max_wait_seconds = 240
-    results = []
-    prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))]
-    for args, env in ((arg1, env1), (arg2, env2)):
-        with RemoteOpenAIServer(model,
-                                args,
-                                env_dict=env,
-                                max_wait_seconds=max_wait_seconds) as server:
-            client = server.get_client()
-            completion = client.completions.create(model=model,
-                                                   prompt=prompts,
-                                                   max_tokens=max_output_len,
-                                                   seed=seed,
-                                                   temperature=temperature,
-                                                   logprobs=logprobs)
-            results.append({
-                "test":
-                "seeded_sampling",
-                "text": [choice.text for choice in completion.choices],
-                "logprobs": [choice.logprobs for choice in completion.choices],
-                "finish_reason":
-                [choice.finish_reason for choice in completion.choices],
-                "usage":
-                completion.usage,
-            })
-    n = len(results) // 2
-    arg1_results = results[:n]
-    arg2_results = results[n:]
-    # Separate logprobs to avoid asserting exact equality.
-    arg1_logprobs = [r.pop("logprobs") for r in arg1_results]
-    arg2_logprobs = [r.pop("logprobs") for r in arg2_results]
-    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, (
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
-            f"{arg1_result=} != {arg2_result=}")
-    if logprobs:
-        for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs):
-            for l1, l2 in zip(logs1, logs2):
-                assert l1.tokens == l2.tokens
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-from vllm import SamplingParams
-from .conftest import get_output_from_llm_generator
-@pytest.mark.parametrize("common_llm_kwargs",
-                         [{
-                             "model": "meta-llama/Llama-3.2-1B-Instruct",
-                         }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Speculative max model len > overridden max model len should raise.
-            "speculative_config": {
-                "model": "JackFram/llama-68m",
-                "num_speculative_tokens": 5,
-                "max_model_len": 129,
-            },
-            "max_model_len": 128,
-        },
-        {
-            # Speculative max model len > draft max model len should raise.
-            # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
-            "speculative_config": {
-                "model": "JackFram/llama-68m",
-                "num_speculative_tokens": 5,
-                "max_model_len": 2048 + 1,
-            },
-        },
-        {
-            # Speculative max model len > target max model len should raise.
-            # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
-            "speculative_config": {
-                "model": "JackFram/llama-68m",
-                "num_speculative_tokens": 5,
-                "max_model_len": 131072 + 1,
-            },
-        },
-    ])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
-    """Verify that speculative decoding validates speculative_max_model_len.
-    """
-    output_len = 128
-    temperature = 0.0
-    prompts = [
-        "Hello, my name is",
-    ]
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-    with pytest.raises(ValueError, match="cannot be larger than"):
-        get_output_from_llm_generator(test_llm_generator, prompts,
-                                      sampling_params)
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""This docstring details important information on the testing methodology.
-Most of the tests rely on "greedy equality", where we expect the output of
-speculative decoding on a sequence to exactly match the output of normal non-
-speculative decoding.
-Since speculative decoding with rejection sampling guarantees that the output
-distribution matches the target model's output distribution (up to hardware
-numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
-equality.
-However, we still need to verify below scenario could be passed:
-    * Batch size 1 greedy equality
-    * Batch size >1 greedy equality
-    * Test greedy equality under preemption
-    * Test greedy equality under various number of speculative tokens.
-With those tests, we can say at least, EAGLE would not break the
-correctness for the target model outputs.
-"""
-import pytest
-from .conftest import run_equality_correctness_test
-# main model
-MAIN_MODEL = "JackFram/llama-68m"
-# speculative model
-SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"
-# max. number of speculative tokens: this corresponds to
-# num_heads in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 4
-# precision
-PRECISION = "float32"
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Print spec metrics.
-        "disable_log_stats": False,
-        # Precision
-        "dtype": PRECISION,
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                      per_test_common_llm_kwargs,
-                                      baseline_llm_kwargs, test_llm_kwargs,
-                                      batch_size: int, output_len: int,
-                                      seed: int):
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Print spec metrics.
-        "disable_log_stats": False,
-        # Precision
-        "dtype": PRECISION,
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_logprobs": False,
-    },
-}, {
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_logprobs": True,
-    },
-}])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("seed", [1])
-@pytest.mark.parametrize("logprobs", [1, 6])
-def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
-                                   per_test_common_llm_kwargs,
-                                   baseline_llm_kwargs, test_llm_kwargs,
-                                   batch_size: int, output_len: int, seed: int,
-                                   logprobs: int):
-    run_equality_correctness_test(
-        vllm_runner,
-        common_llm_kwargs,
-        per_test_common_llm_kwargs,
-        baseline_llm_kwargs,
-        test_llm_kwargs,
-        batch_size,
-        output_len,
-        seed,
-        logprobs=logprobs,
-        prompt_logprobs=logprobs,
-        disable_logprobs=test_llm_kwargs["speculative_config"]
-        ["disable_logprobs"])
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "enforce_eager": False,
-        # Print spec metrics.
-        "disable_log_stats": False,
-        # Precision
-        "dtype": PRECISION,
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize("output_len", [
-    128,
-])
-@pytest.mark.parametrize("batch_size", [1, 32])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness_cuda_graph(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
-    """Verify greedy equality with cuda graph enabled and different
-    batch sizes."""
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "block_size": 8,
-        # 2 for small prompt, 256//8 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 8,
-        "max_model_len": (2 + 256 // 8) * 8,
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Precision
-        "dtype": PRECISION,
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": SPEC_MODEL,
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use small output len for fast test.
-        128,
-    ])
-@pytest.mark.parametrize("batch_size", [4])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_e2e_greedy_correctness_with_preemption(
-        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
-        seed: int):
-    """Verify greedy equality, even when some sequences are preempted mid-
-    generation.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Precision
-        "dtype": PRECISION,
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        {
-            "speculative_config": {
-                "model": SPEC_MODEL,
-                "num_speculative_tokens": k,
-            },
-        }
-        # Try a range of num. speculative tokens
-        for k in range(1, 1 + MAX_SPEC_TOKENS)
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_different_k(vllm_runner, common_llm_kwargs,
-                           per_test_common_llm_kwargs, baseline_llm_kwargs,
-                           test_llm_kwargs, batch_size: int, output_len: int,
-                           seed: int):
-    """Verify that eagle speculative decoding produces exact equality
-    to without spec decode with different values of num_speculative_tokens.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Precision
-        "dtype": PRECISION,
-        # Main model
-        "model_name": MAIN_MODEL,
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": SPEC_MODEL,
-        "num_speculative_tokens": MAX_SPEC_TOKENS,
-        "disable_by_batch_size": 4,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
-                             per_test_common_llm_kwargs, baseline_llm_kwargs,
-                             test_llm_kwargs, batch_size: int, output_len: int,
-                             seed: int):
-    """Verify that eagle speculative decoding produces exact equality
-    to without spec decode when speculation is disabled for large
-    batch sizes.
-    """
-    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs, test_llm_kwargs,
-                                  batch_size, output_len, seed)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Print spec metrics.
-        "disable_log_stats": False,
-        # Precision
-        "dtype": "float16",
-        # Main model
-        "model_name": "meta-llama/Llama-2-7b-chat-hf",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": "yuhuili/EAGLE-llama2-chat-7B",
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize("seed", [1])
-def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                             per_test_common_llm_kwargs,
-                                             baseline_llm_kwargs,
-                                             test_llm_kwargs, batch_size: int,
-                                             output_len: int, seed: int):
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # 2 for small prompt, 256//16 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 16,
-        "max_model_len": (2 + 256 // 16) * 16,
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Print spec metrics.
-        "disable_log_stats": False,
-        # Precision
-        "dtype": "float16",
-        # Main model
-        "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize("seed", [1])
-def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                             per_test_common_llm_kwargs,
-                                             baseline_llm_kwargs,
-                                             test_llm_kwargs, batch_size: int,
-                                             output_len: int, seed: int):
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        # 2 for small prompt, 256//16 for generated.
-        "num_gpu_blocks_override": 2 + 256 // 16,
-        "max_model_len": (2 + 256 // 16) * 16,
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # Print spec metrics.
-        "disable_log_stats": False,
-        # Precision
-        "dtype": "float16",
-        # Main model
-        "model_name": "Qwen/Qwen2-7B-Instruct",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    {
-        "speculative_config": {
-            "model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
-            "num_speculative_tokens": MAX_SPEC_TOKENS,
-        },
-    },
-])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize("seed", [1])
-def test_qwen2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
-                                            per_test_common_llm_kwargs,
-                                            baseline_llm_kwargs,
-                                            test_llm_kwargs, batch_size: int,
-                                            output_len: int, seed: int):
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  output_len,
-                                  seed,
-                                  temperature=0.0)
-if __name__ == "__main__":
-    import pytest
-    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests which cover integration of the speculative decoding framework with
-other features, e.g. cuda graphs.
-"""
-import pytest
-from .conftest import run_equality_correctness_test
-MAIN_MODEL = "JackFram/llama-68m"
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-68m",
-        # Verify equality when cuda graphs allowed.
-        "enforce_eager": False,
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Identical models.
-            "speculative_config": {
-                "model": "JackFram/llama-68m",
-                "num_speculative_tokens": 5,
-            },
-        },
-    ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize("output_len", [32])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
-                                per_test_common_llm_kwargs,
-                                baseline_llm_kwargs, test_llm_kwargs,
-                                batch_size: int, output_len: int, seed: int):
-    """Verify spec decode equality when cuda graphs are enabled.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": "JackFram/llama-160m",
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        # Explicitly specify draft model quantization
-        {
-            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
-                "num_speculative_tokens": 5,
-                "quantization": "gptq",
-            },
-        },
-        # Explicitly specify GPTQ-based draft model to use marlin quantization
-        {
-            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
-                "num_speculative_tokens": 5,
-                "quantization": "marlin",
-            },
-        },
-        # Not explicitly specify draft model quantization
-        {
-            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
-                "num_speculative_tokens": 5,
-                "quantization": None,
-            },
-        },
-    ])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
-                                               per_test_common_llm_kwargs,
-                                               baseline_llm_kwargs,
-                                               test_llm_kwargs,
-                                               batch_size: int, seed: int):
-    """Verify spec decode works well with draft model quantization configs.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=32,
-                                  seed=seed,
-                                  temperature=0.0)
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model_name": MAIN_MODEL,
-        # Skip cuda graph recording for fast test.
-        "enforce_eager": True,
-        # The original model is float32, keep it for numerical stability.
-        "dtype": "float32",
-    }])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
-@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "speculative_config": {
-        "model": "JackFram/llama-68m",
-        "num_speculative_tokens": 3,
-        "disable_mqa_scorer": True,
-    },
-}])
-@pytest.mark.parametrize("batch_size", [1, 5])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
-                    baseline_llm_kwargs, test_llm_kwargs, batch_size: int,
-                    output_len: int, seed: int):
-    """Verify that speculative decoding generates the same output
-    with batch expansion scorer and mqa scorer.
-    """
-    run_equality_correctness_test(vllm_runner,
-                                  common_llm_kwargs,
-                                  per_test_common_llm_kwargs,
-                                  baseline_llm_kwargs,
-                                  test_llm_kwargs,
-                                  batch_size,
-                                  max_output_len=output_len,
-                                  seed=seed,
-                                  temperature=0.0)
--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests which cover integration of the speculative decoding framework with
-tensor parallelism.
-"""
-import json
-from typing import Optional
-import pytest
-import torch
-from vllm.platforms import current_platform
-from .conftest import run_equality_correctness_test_tp
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce-eager",
-        "--tensor-parallel-size",
-        "2"
-    ]])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize("test_llm_kwargs", [
-    [
-        "--speculative_config",
-        json.dumps({
-            "model": "JackFram/llama-68m",
-            "num_speculative_tokens": 3,
-        }),
-    ],
-    [
-        "--speculative_config",
-        json.dumps({
-            "model": "ngram",
-            "num_speculative_tokens": 5,
-            "prompt_lookup_max": 3,
-        }),
-    ],
-])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # Use smaller output len for fast test.
-        32,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
-                              baseline_llm_kwargs, test_llm_kwargs,
-                              batch_size: int, output_len: int, seed: int):
-    """Verify greedy equality when tensor parallelism is used.
-    """
-    if current_platform.is_rocm():
-        pytest.skip("hip is not well-supported yet")
-    run_equality_correctness_test_tp("JackFram/llama-68m",
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     output_len,
-                                     seed,
-                                     temperature=0.0)
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce-eager",
-        "--tensor_parallel_size",
-        "2",
-        # precision
-        "--dtype",
-        "bfloat16",
-    ]])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize(
-    "model, test_llm_kwargs",
-    [("JackFram/llama-68m", [
-        "--speculative_config",
-        json.dumps({
-            "model": "JackFram/llama-68m",
-            "num_speculative_tokens": 5,
-            "draft_tensor_parallel_size": 1,
-        }),
-    ]),
-     ("ibm-granite/granite-3b-code-instruct", [
-         "--speculative_config",
-         json.dumps({
-             "model": "ibm-granite/granite-3b-code-instruct",
-             "num_speculative_tokens": 5,
-             "draft_tensor_parallel_size": 1,
-         }),
-     ])])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
-                                            per_test_common_llm_kwargs,
-                                            baseline_llm_kwargs,
-                                            test_llm_kwargs, batch_size: int,
-                                            seed: int):
-    """Verify spec decode works well with smaller tp for draft models.
-    """
-    run_equality_correctness_test_tp(model,
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     max_output_len=32,
-                                     seed=seed,
-                                     temperature=0.0)
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce-eager",
-        "--tensor_parallel_size",
-        "2",
-        # precision
-        "--dtype",
-        "bfloat16",
-    ]])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [["--enable-chunked-prefill", "False"],
-     [
-         "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
-         "--max-num-seqs", "4"
-     ]])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize("model, test_llm_kwargs",
-                         [("JackFram/llama-68m", [
-                             "--speculative_config",
-                             json.dumps({
-                                 "model": "JackFram/llama-68m",
-                                 "num_speculative_tokens": 3,
-                             }),
-                         ]),
-                          ("JackFram/llama-68m", [
-                              "--speculative_config",
-                              json.dumps({
-                                  "model": "JackFram/llama-68m",
-                                  "num_speculative_tokens": 3,
-                                  "draft_tensor_parallel_size": 1,
-                              }),
-                          ])])
-@pytest.mark.parametrize("logprobs", [None])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
-                                         per_test_common_llm_kwargs,
-                                         baseline_llm_kwargs, test_llm_kwargs,
-                                         logprobs: Optional[int],
-                                         batch_size: int, seed: int):
-    """Verify spec decode works well with same and different TP size for
-    the draft model with chunked prefill.
-    """
-    run_equality_correctness_test_tp(model,
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     max_output_len=32,
-                                     seed=seed,
-                                     temperature=0.0,
-                                     logprobs=logprobs)
-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce-eager",
-        "--tensor_parallel_size",
-        "2",
-        # precision
-        "--dtype",
-        "bfloat16",
-    ]])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [["--enable-chunked-prefill", "False"],
-     [
-         "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
-         "--max-num-seqs", "4"
-     ]])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize("model, test_llm_kwargs",
-                         [("JackFram/llama-68m", [
-                             "--speculative_config",
-                             json.dumps({
-                                 "model": "JackFram/llama-68m",
-                                 "num_speculative_tokens": 3,
-                                 "disable_logprobs": False,
-                             }),
-                         ]),
-                          ("JackFram/llama-68m", [
-                              "--speculative_config",
-                              json.dumps({
-                                  "model": "JackFram/llama-68m",
-                                  "num_speculative_tokens": 3,
-                                  "draft_tensor_parallel_size": 1,
-                                  "disable_logprobs": False,
-                              }),
-                          ])])
-@pytest.mark.parametrize("logprobs", [2])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_chunked_prefill_tp2_with_logprobs(
-        model, common_llm_kwargs, per_test_common_llm_kwargs,
-        baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
-        batch_size: int, seed: int):
-    """Verify spec decode works well with same and different TP size for
-    the draft model with chunked prefill.
-    """
-    run_equality_correctness_test_tp(model,
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     max_output_len=32,
-                                     seed=seed,
-                                     temperature=0.0,
-                                     logprobs=logprobs)
--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests which cover integration of the speculative decoding framework with
-tensor parallelism.
-"""
-import json
-import openai
-import pytest
-import torch
-from .conftest import run_equality_correctness_test_tp
-MAIN_MODEL = "JackFram/llama-68m"
-SPEC_MODEL = "JackFram/llama-68m"
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce_eager",
-        "--tensor-parallel-size",
-        "4",
-    ]])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [
-    [],
-])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        #TODO(wooyeon): add spec_draft_dp=2 case
-        [
-            "--speculative_config",
-            json.dumps({
-                "model": f"{SPEC_MODEL}",
-                "num_speculative_tokens": 5,
-                "draft_tensor_parallel_size": 1,
-            }),
-        ],
-    ])
-@pytest.mark.parametrize("batch_size", [2])
-@pytest.mark.parametrize("seed", [1])
-def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
-                                            per_test_common_llm_kwargs,
-                                            baseline_llm_kwargs,
-                                            test_llm_kwargs, batch_size: int,
-                                            seed: int):
-    """Verify spec decode works well with smaller tp for draft models.
-    """
-    run_equality_correctness_test_tp(MAIN_MODEL,
-                                     common_llm_kwargs,
-                                     per_test_common_llm_kwargs,
-                                     baseline_llm_kwargs,
-                                     test_llm_kwargs,
-                                     batch_size,
-                                     max_output_len=32,
-                                     seed=seed,
-                                     temperature=0.0)
-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [[
-        # Skip cuda graph recording for fast test.
-        "--enforce-eager",
-        "--tensor-parallel-size",
-        "4",
-    ]])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
-@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
-@pytest.mark.parametrize(
-    "test_llm_kwargs",
-    [
-        [
-            # Artificially limit the draft model max model len; this forces vLLM
-            # to skip speculation once the sequences grow beyond 32-k tokens.
-            "--speculative_config",
-            json.dumps({
-                "model": f"{SPEC_MODEL}",
-                "num_speculative_tokens": 5,
-                "max_model_len": 32,
-            }),
-        ],
-    ])
-@pytest.mark.parametrize("batch_size", [8])
-@pytest.mark.parametrize(
-    "output_len",
-    [
-        # This must be a good bit larger than speculative_max_model_len so that
-        # we can test the case where all seqs are skipped, but still small to
-        # ensure fast test.
-        64,
-    ])
-@pytest.mark.parametrize("seed", [1])
-def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs,
-                          baseline_llm_kwargs, test_llm_kwargs,
-                          batch_size: int, output_len: int, seed: int):
-    """Verify job failure with RuntimeError when all sequences skip speculation.
-    We do this by setting the max model len of the draft model to an
-    artificially low value, such that when the sequences grow beyond it, they
-    are skipped in speculative decoding.
-    TODO: fix it to pass without raising Error. (#5814)
-    """
-    with pytest.raises(
-        (openai.APIConnectionError, openai.InternalServerError)):
-        run_equality_correctness_test_tp(MAIN_MODEL,
-                                         common_llm_kwargs,
-                                         per_test_common_llm_kwargs,
-                                         baseline_llm_kwargs,
-                                         test_llm_kwargs,
-                                         batch_size,
-                                         output_len,
-                                         seed,
-                                         temperature=0.0)