Merge tag 'v0.5.5' into v0.5.5-dtk24.04.1

af7f4372 · zhuwenwen · 5e19cdef · 09c77926 · af7f4372 · af7f4372
Commit af7f4372 authored Sep 03, 2024 by zhuwenwen
20 changed files
--- a/tests/multi_step/__init__.py
+++ b/tests/multi_step/__init__.py
--- a/tests/multi_step/test_correctness.py
+++ b/tests/multi_step/test_correctness.py
+# Test the AsyncLLMEngine with multi-step-decoding
+
+from typing import List
+
+import pytest
+
+from ..utils import RemoteOpenAIServer
+
+# Model(s) the multi-step test below is parametrized over.
+MODELS = [
+    "JackFram/llama-160m",
+]
+NUM_SCHEDULER_STEPS = [8]  # Multi-step decoding steps
+NUM_PROMPTS = [10]  # Number of prompts sent in each completion request
+
+# CLI flags shared by every server launched in this module; individual
+# launches append their own flags to a copy of this list.
+DEFAULT_SERVER_ARGS: List[str] = [
+    "--disable-log-requests",
+    "--use-v2-block-manager",
+    "--worker-use-ray",
+    "--gpu-memory-utilization",
+    "0.85",
+    "--swap-space",
+    "16",
+]
+
+
+async def completions_with_server_args(prompts: List[str], model_name: str,
+                                       server_cli_args: List[str]):
+    """Launch a server with the given CLI args and request completions.
+
+    A ``RemoteOpenAIServer`` is started for ``model_name`` with
+    ``server_cli_args``. One non-streaming completion request covering all
+    ``prompts`` is issued (temperature 0, at most 5 new tokens), and the
+    response is returned after the server context has exited.
+    """
+    request_kwargs = dict(
+        model=model_name,
+        prompt=prompts,
+        temperature=0,
+        stream=False,
+        max_tokens=5,
+    )
+
+    completions = None
+    with RemoteOpenAIServer(model_name, server_cli_args) as server:
+        async_client = server.get_async_client()
+        completions = await async_client.completions.create(**request_kwargs)
+
+    assert completions is not None
+    return completions
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize(("tp_size, pp_size"), [
+    (1, 1),
+    (2, 2),
+])
+@pytest.mark.parametrize("eager_mode", [False, True])
+@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS)
+@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
+@pytest.mark.asyncio
+async def test_multi_step(example_prompts, model: str, tp_size: int,
+                          pp_size: int, eager_mode: bool,
+                          num_scheduler_steps: int, num_prompts: int):
+    """Compare multi-step completions against a reference server.
+
+    The reference server is launched with ``--enforce-eager`` and without
+    ``--num-scheduler-steps``; the server under test is launched with
+    ``--num-scheduler-steps`` and, when ``eager_mode`` is true, with
+    ``--enforce-eager`` as well. Both use the same tensor/pipeline parallel
+    sizes, and the generated texts must match exactly.
+    """
+
+    # Repeat the fixture prompts until there are at least ``num_prompts`` of
+    # them, then trim to exactly that many.
+    prompts = example_prompts
+    if len(prompts) < num_prompts:
+        prompts = prompts * ((num_prompts // len(prompts)) + 1)
+    prompts = prompts[:num_prompts]
+    assert len(prompts) == num_prompts
+
+    # ``+`` builds new lists, so DEFAULT_SERVER_ARGS itself is never mutated.
+    server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
+    ms_server_args = DEFAULT_SERVER_ARGS + \
+        ["--num-scheduler-steps", f"{num_scheduler_steps}"]
+
+    if eager_mode:
+        ms_server_args.append("--enforce-eager")
+
+    distributed_args = [
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--pipeline-parallel-size",
+        str(pp_size),
+    ]
+
+    # The two servers are run one after the other, never concurrently.
+    ref_completions = await completions_with_server_args(
+        prompts, model, server_args + distributed_args)
+    test_completions = await completions_with_server_args(
+        prompts, model, ms_server_args + distributed_args)
+
+    def get_text_generations(completions):
+        return [x.text for x in completions.choices]
+
+    ref_generations = get_text_generations(ref_completions)
+    test_generations = get_text_generations(test_completions)
+    assert ref_generations == test_generations
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
+from contextlib import nullcontext
+
 import numpy as np
 import pytest
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor

 from vllm.config import ModelConfig
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import MultiModalRegistry
 from vllm.multimodal.utils import rescale_image_size


+@pytest.fixture
+def mm_registry():
+    """Return a newly constructed ``MultiModalRegistry`` for the test."""
+    return MultiModalRegistry()
+
+
 @pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
-def test_clip_image_processor(image_assets, dtype, size_factor):
+def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
@@ -23,8 +30,11 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
        seed=0,
        dtype=dtype,
        revision=None,
+        limit_mm_per_prompt={"image": 1},
    )

+    mm_registry.init_mm_limits_per_prompt(model_config)
+
    for asset in image_assets:
        image = rescale_image_size(asset.pil_image, size_factor)

@@ -32,7 +42,7 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
            image,
            return_tensors="pt",
        )
-        vllm_result = MULTIMODAL_REGISTRY.map_input(
+        vllm_result = mm_registry.map_input(
            model_config,
            {"image": image},
        )
@@ -48,7 +58,8 @@ def test_clip_image_processor(image_assets, dtype, size_factor):

 @pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
-def test_llava_next_image_processor(image_assets, dtype, size_factor):
+def test_llava_next_image_processor(image_assets, mm_registry, dtype,
+                                    size_factor):
    MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
@@ -62,8 +73,11 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
        seed=0,
        dtype=dtype,
        revision=None,
+        limit_mm_per_prompt={"image": 1},
    )

+    mm_registry.init_mm_limits_per_prompt(model_config)
+
    for asset in image_assets:
        image = rescale_image_size(asset.pil_image, size_factor)

@@ -71,7 +85,7 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
            image,
            return_tensors="pt",
        )
-        vllm_result = MULTIMODAL_REGISTRY.map_input(
+        vllm_result = mm_registry.map_input(
            model_config,
            {"image": image},
        )
@@ -83,3 +97,61 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):

            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
+
+
+@pytest.mark.parametrize(
+    ("num_images", "limit", "is_valid"),
+    [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
+     (2, 1, False), (2, 2, True)],
+)
+def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
+    """Check how ``map_input`` reacts to the per-prompt image limit.
+
+    Cases marked ``is_valid`` must map without raising; the remaining cases
+    are expected to raise ``ValueError``.
+    """
+    model_name = "llava-hf/llava-1.5-7b-hf"
+    config = ModelConfig(
+        model=model_name,
+        tokenizer=model_name,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="half",
+        revision=None,
+        limit_mm_per_prompt={"image": limit},
+    )
+    mm_registry.init_mm_limits_per_prompt(config)
+
+    # A single image is passed bare; several images are passed as a list.
+    first_image = image_assets[0].pil_image
+    if num_images == 1:
+        mm_inputs = {"image": first_image}
+    elif num_images == 0:
+        mm_inputs = {}
+    else:
+        mm_inputs = {"image": [first_image] * num_images}
+
+    expectation = nullcontext() if is_valid else pytest.raises(ValueError)
+    with expectation:
+        mm_registry.map_input(config, mm_inputs)
+
+
+# NOTE: We don't test zero images since the HF processor doesn't support it
+@pytest.mark.parametrize("num_images", [1, 2])
+def test_image_mapper_multi(image_assets, mm_registry, num_images):
+    """Check that mapping a list of images yields one entry per image."""
+    model_name = "llava-hf/llava-1.5-7b-hf"
+    config = ModelConfig(
+        model=model_name,
+        tokenizer=model_name,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="half",
+        revision=None,
+        limit_mm_per_prompt={"image": num_images},
+    )
+    mm_registry.init_mm_limits_per_prompt(config)
+
+    # The same asset image is repeated to build the multi-image input.
+    repeated_images = [image_assets[0].pil_image] * num_images
+    mapped = mm_registry.map_input(config, {"image": repeated_images})
+
+    assert len(mapped["pixel_values"]) == num_images
--- a/tests/plugins/vllm_add_dummy_model/setup.py
+++ b/tests/plugins/vllm_add_dummy_model/setup.py
+from setuptools import setup
+
+# Package the dummy-model plugin and expose its ``register`` function under
+# the 'vllm.general_plugins' entry-point group.
+setup(name='vllm_add_dummy_model',
+      version='0.1',
+      packages=['vllm_add_dummy_model'],
+      entry_points={
+          'vllm.general_plugins':
+          ["register_dummy_model = vllm_add_dummy_model:register"]
+      })
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py
+from typing import Optional
+
+import torch
+
+from vllm import ModelRegistry
+from vllm.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+
+
+class MyOPTForCausalLM(OPTForCausalLM):
+    """OPT subclass for plugin tests whose logits always favor index 0."""
+
+    def compute_logits(
+            self, hidden_states: torch.Tensor,
+            sampling_metadata: SamplingMetadata) -> Optional[torch.Tensor]:
+        """Compute the parent's logits, then overwrite them in place.
+
+        Every entry is zeroed and column 0 is raised to 1.0. A ``None``
+        result from the parent is passed through unchanged.
+        """
+        result = super().compute_logits(hidden_states, sampling_metadata)
+        if result is None:
+            return None
+
+        # this dummy model always predicts the first token
+        result.zero_()
+        result[:, 0] += 1.0
+        return result
+
+
+def register():
+    """Register ``MyOPTForCausalLM`` with ``ModelRegistry`` if absent."""
+    arch_name = "MyOPTForCausalLM"
+    if arch_name in ModelRegistry.get_supported_archs():
+        # already registered; nothing to do
+        return
+    # register our dummy model
+    ModelRegistry.register_model(arch_name, MyOPTForCausalLM)
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -34,6 +34,9 @@ def test_block_allocator(
    assert (first_block == second_block)
    assert (second_block.ref_count == 2)

+    # Check metric: 1 hit of 2 queries
+    assert block_allocator.get_prefix_cache_hit_rate() == 0.5
+
    # Free the first_block and confirm that the ref_count is correctly
    # decremented on the second block
    block_allocator.free(first_block)
@@ -48,6 +51,10 @@ def test_block_allocator(
    assert (first_block == second_block)
    assert (first_block.block_hash == block_hash)

+    # Allocate one more time to get 3/4 hit rate for easy checking
+    block_allocator.allocate(block_hash, 0)
+    assert block_allocator.get_prefix_cache_hit_rate() == 0.75
+

 @pytest.mark.parametrize("num_blocks", [16])
 def test_eviction(num_blocks: int, ):

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -9,7 +9,7 @@ import torch
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsWNA16)
+    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    QuantizationType)

@@ -109,7 +109,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):

        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
-        assert qkv_proj.weight_packed.pack_factor == pack_factor
+        assert qkv_proj.scheme.pack_factor == pack_factor

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
@@ -140,12 +140,16 @@ def test_compressed_tensors_fp8(vllm_runner):
        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
-        assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+        assert isinstance(
+            qkv_proj.scheme,
+            (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))
+
        assert qkv_proj.input_scale.dtype is torch.float32
-        assert qkv_proj.weight_scale.dtype is torch.float32
-        # should be scalars after processing
+
+        if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
            assert len(qkv_proj.input_scale.shape) == 0
+            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+            assert qkv_proj.weight_scale.dtype is torch.float32
            assert len(qkv_proj.weight_scale.shape) == 0

        output = llm.generate_greedy("Hello my name is", max_tokens=20)

--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
+# Expanded quantized model tests for CPU offloading
+# Base tests: tests/basic_correctness/test_cpu_offload.py
+
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+
+from ..utils import compare_two_settings
+
+
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
+                    reason="fp8 is not supported on this GPU type.")
+def test_cpu_offload_fp8():
+    """Compare fp8 runs with and without ``--cpu-offload-gb 2``."""
+    scenarios = [
+        # Test quantization of an unquantized checkpoint
+        ("meta-llama/Meta-Llama-3-8B-Instruct", ["--quantization", "fp8"]),
+        # Test loading a quantized checkpoint
+        ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", []),
+    ]
+    for checkpoint, base_args in scenarios:
+        compare_two_settings(checkpoint,
+                             base_args,
+                             base_args + ["--cpu-offload-gb", "2"],
+                             max_wait_seconds=480)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    """Compare GPTQ runs with and without ``--cpu-offload-gb 1``."""
+    checkpoint = "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
+    base_arg_sets = [
+        # Test GPTQ Marlin
+        [],
+        # Test GPTQ
+        ["--quantization", "gptq"],
+    ]
+    for base_args in base_arg_sets:
+        compare_two_settings(checkpoint,
+                             base_args,
+                             base_args + ["--cpu-offload-gb", "1"],
+                             max_wait_seconds=480)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
+def test_cpu_offload_awq():
+    """Compare AWQ runs with and without ``--cpu-offload-gb 1``."""
+    checkpoint = "Qwen/Qwen2-1.5B-Instruct-AWQ"
+    base_arg_sets = [
+        # Test AWQ Marlin
+        [],
+        # Test AWQ
+        ["--quantization", "awq"],
+    ]
+    for base_args in base_arg_sets:
+        compare_two_settings(checkpoint,
+                             base_args,
+                             base_args + ["--cpu-offload-gb", "1"],
+                             max_wait_seconds=480)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_compressed_tensors():
+    """Compare compressed-tensors checkpoints with and without CPU offload.
+
+    Each checkpoint is run once with no extra flags and once with
+    ``--cpu-offload-gb 1``.
+    """
+    checkpoints = [
+        # Test wNa16
+        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+        # Test w4a16_marlin24
+        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+        # Test w8a8
+        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    ]
+    for checkpoint in checkpoints:
+        compare_two_settings(checkpoint, [], ["--cpu-offload-gb", "1"],
+                             max_wait_seconds=480)
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
+# flake8: noqa
+"""Tests experts_int8 quantization startup and generation, 
+doesn't test correctness
+"""
+import pytest
+
+from tests.quantization.utils import is_quant_method_supported
+
+MODELS = ["ai21labs/Jamba-tiny-random"]
+
+
+@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
+                    reason="ExpertsInt8 is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_model_experts_int8_startup(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Load ``model`` with experts_int8 quantization and generate greedily.
+
+    Only checks that loading and generation run; the generated output is not
+    inspected. The ``hf_runner`` fixture is requested but not used in the
+    body.
+    """
+
+    with vllm_runner(model, dtype=dtype,
+                     quantization="experts_int8") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -9,6 +9,7 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
                                                         Fp8LinearMethod)
+from vllm.platforms import current_platform

 MODELS = [
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
@@ -20,7 +21,12 @@ MODELS = [
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", MODELS)
-def test_model_load_and_run(vllm_runner, model_id: str):
+@pytest.mark.parametrize("force_marlin", [False, True])
+def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
+                            monkeypatch) -> None:
+    if force_marlin:
+        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
+
    with vllm_runner(model_id) as llm:
        # note: this does not test accuracy, just that we can run through
        # see lm-eval tests for accuracy
@@ -61,7 +67,12 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None:
+@pytest.mark.parametrize("force_marlin", [False, True])
+def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
+                         monkeypatch) -> None:
+    if force_marlin:
+        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
+
    with vllm_runner("facebook/opt-125m",
                     quantization="fp8",
                     kv_cache_dtype=kv_cache_dtype) as llm:
@@ -75,9 +86,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None:
            assert attn._k_scale == 1.0
            assert attn._v_scale == 1.0

-        capability = torch.cuda.get_device_capability()
+        capability = current_platform.get_device_capability()
        capability = capability[0] * 10 + capability[1]
-        if capability >= 89:
+        if capability >= 89 and not force_marlin:
            # For GPUs with hardware support, we keep weights in fp8
            assert fc1.weight.dtype == torch.float8_e4m3fn
        else:
@@ -116,16 +127,18 @@ def test_scaled_fp8_quant(dtype) -> None:

    # Reference dynamic quantization
    y = quantize_ref(x, inv_scale)
-    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y,
+                               per_tensor_dequantize(y, inv_scale, dtype))

    # Static quantization
    y, _ = ops.scaled_fp8_quant(x, inv_scale)
-    assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
+    torch.testing.assert_close(ref_y,
+                               per_tensor_dequantize(y, inv_scale, dtype))

    # Padding
    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
    assert y.shape[0] == 17
-    assert torch.allclose(
+    torch.testing.assert_close(
        ref_y,
        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
                              dtype))
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -7,11 +7,12 @@ from typing import Tuple
 import pytest
 import torch

-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.gptq_marlin import (
    GPTQMarlinLinearMethod)
 from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    UnquantizedEmbeddingMethod)

 PROMPT = "On the surface of Mars, we found"

@@ -37,7 +38,8 @@ def test_lm_head(
            lm_head_layer.linear_method,
            (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
    else:
-        assert isinstance(lm_head_layer.linear_method, UnquantizedLinearMethod)
+        assert isinstance(lm_head_layer.linear_method,
+                          UnquantizedEmbeddingMethod)

    print(
        vllm_model.generate_greedy(prompts=["Hello my name is"],

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -25,7 +25,7 @@ def mock_causal_accepted_tensor(

    accepted = (torch.arange(k).expand(batch_size, k) <=
                last_accepted_indices.unsqueeze(-1).broadcast_to(
-                    batch_size, k)).to(device="cuda")
+                    batch_size, k))

    # Sprinkle accepted values after the contiguous initial accepted values.
    # This replicates the behavior of rejection sampling, which may "accept"
@@ -33,7 +33,7 @@ def mock_causal_accepted_tensor(
    sprinkle_candidates = (
        torch.arange(k).expand(batch_size, k) >
        last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
-    sprinkle = torch.rand(batch_size, k, device="cuda") > 0.5
+    sprinkle = torch.rand(batch_size, k) > 0.5
    accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
    return accepted

@@ -86,7 +86,7 @@ def test_correct_output_format(which_tokens_accepted: str,

    rejection_sampler = RejectionSampler(
        disable_bonus_tokens=disable_bonus_tokens)
-    rejection_sampler.init_gpu_tensors(rank=0)
+    rejection_sampler.init_gpu_tensors(device=device)
    output_token_ids = rejection_sampler._create_output(  # pylint: disable=protected-access
        accepted,
        recovered_token_ids,
@@ -138,7 +138,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
                                    device: str):
    torch.set_default_device(device)
    rejection_sampler = RejectionSampler()
-    rejection_sampler.init_gpu_tensors(rank=0)
+    rejection_sampler.init_gpu_tensors(device=device)

    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
@@ -167,7 +167,7 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                                   device: str):
    torch.set_default_device(device)
    rejection_sampler = RejectionSampler()
-    rejection_sampler.init_gpu_tensors(rank=0)
+    rejection_sampler.init_gpu_tensors(device=device)

    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
@@ -211,7 +211,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
    torch.set_default_device(device)

    rejection_sampler = RejectionSampler(strict_mode=True)
-    rejection_sampler.init_gpu_tensors(rank=0)
+    rejection_sampler.init_gpu_tensors(device=device)

    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
@@ -339,7 +339,7 @@ class _CorrectnessTestHelper:
        self.vocab_size = vocab_size
        self.vocab_range = (0, vocab_size)

-        self.rejection_sampler.init_gpu_tensors(rank=0)
+        self.rejection_sampler.init_gpu_tensors(device=0)

        # Keep test simple, use k=1
        self.k = 1

--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
 import itertools
 import random
+from array import array
 from typing import Dict, List, Optional, Tuple
-from unittest.mock import patch
+from unittest.mock import Mock, patch

 import pytest
 import torch
 from transformers import GenerationConfig, GenerationMixin

+import vllm.envs as envs
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
+from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams,
+                           SequenceData, SequenceGroupMetadata)
 from vllm.utils import Counter, is_pin_memory_available


@@ -56,7 +59,9 @@ def _do_sample(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
-                seq_data={0: SequenceData([1, 2, 3])},
+                seq_data={
+                    0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3]))
+                },
                sampling_params=sampling_params,
                block_tables={0: [1]},
            ))
@@ -201,7 +206,8 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):

    def create_sequence_data(num_input=3, num_generated=0):
        seq_data = SequenceData(
-            random.choices(range(0, VOCAB_SIZE), k=num_input))
+            array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                  random.choices(range(0, VOCAB_SIZE), k=num_input)))
        if num_generated > 0:
            seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE),
                                                       k=num_generated)
@@ -504,7 +510,9 @@ def test_sampler_mixed(seed: int, device: str):
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
-                seq_data={0: SequenceData([1, 2, 3])},
+                seq_data={
+                    0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3]))
+                },
                sampling_params=sampling_params,
                block_tables={0: [1]},
            ))
@@ -600,7 +608,9 @@ def test_sampler_top_k_top_p(seed: int, device: str):
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
-                seq_data={0: SequenceData([1, 2, 3])},
+                seq_data={
+                    0: SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [1, 2, 3]))
+                },
                sampling_params=SamplingParams(
                    temperature=1,
                    top_k=top_k,
@@ -625,17 +635,51 @@ def test_sampler_top_k_top_p(seed: int, device: str):
        return ([[prob.topk(1, dim=-1).indices.tolist(), [0]]
                 for prob in probs], None)

-    with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
+    # top-k and top-p are only calculated when flashinfer kernel is not available
+    with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \
+         patch("vllm.model_executor.layers.sampler."
+               "flashinfer_top_k_top_p_sampling", None):
        sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

    assert sample_probs is not None

    hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
    hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
-    assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
+    torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5)
    assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))


+@pytest.mark.parametrize("seed", RANDOM_SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_flashinfer_fallback(seed: int, device: str):
+    """Check sampling output is unchanged when the flashinfer hook is stubbed.
+
+    Samples once as-is and once with ``flashinfer_top_k_top_p_sampling``
+    patched by a stub, then asserts both sampler outputs are equal. Skipped
+    when ``envs.VLLM_USE_FLASHINFER_SAMPLER`` is falsy.
+    """
+    if not envs.VLLM_USE_FLASHINFER_SAMPLER:
+        pytest.skip("Flashinfer sampler is disabled")
+
+    set_random_seed(seed)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    _, fake_logits, sampler = _prepare_test(batch_size)
+
+    # Stub that ignores its arguments and returns no samples plus an all-zero
+    # int32 tensor of length batch_size.
+    # NOTE(review): the zeros presumably act as per-request "failed" flags
+    # that trigger the sampler's fallback path; confirm against the sampler.
+    def failing_flashinfer_sampling(*_args, **_kwargs):
+        return None, torch.zeros(batch_size, device=device, dtype=torch.int32)
+
+    # The same seeded SamplingParams instance is used for both runs.
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        n=random.randint(1, 10),
+        seed=random.randint(0, 10000),
+    )
+    sampler_output = _do_sample(batch_size, fake_logits, sampler,
+                                sampling_params, device)
+
+    with patch(
+            "vllm.model_executor.layers.sampler."
+            "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling):
+        fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler,
+                                             sampling_params, device)
+
+    assert sampler_output == fallback_sampler_output
+
+
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_sampler_repetition_penalty_mixed(device: str):

@@ -650,7 +694,11 @@ def test_sampler_repetition_penalty_mixed(device: str):
                SequenceGroupMetadata(
                    request_id=f"test_{i}",
                    is_prompt=True,
-                    seq_data={0: SequenceData([1, 2, 3])},
+                    seq_data={
+                        0:
+                        SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE,
+                                           [1, 2, 3]))
+                    },
                    sampling_params=sampling_params[i],
                    block_tables={0: [1]},
                ))
@@ -703,3 +751,28 @@ def test_sampler_repetition_penalty_mixed(device: str):

    assert tokens1[0] == tokens2[1]
    assert tokens1[1] == tokens2[0]
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_sampler_include_gpu_probs_tensor(device: str):
+    """Check greedy sampling with ``include_gpu_probs_tensor`` enabled.
+
+    With ``should_modify_greedy_probs_inplace`` set to False, sampling at
+    temperature 0 must not call ``_modify_greedy_probs_inplace``, and the
+    output must carry ``sampled_token_probs``, ``logprobs`` and
+    ``sampled_token_ids``.
+    """
+    set_random_seed(42)
+    torch.set_default_device(device)
+    batch_size = random.randint(1, 256)
+    _, fake_logits, sampler = _prepare_test(batch_size)
+    sampler.include_gpu_probs_tensor = True
+    sampler.should_modify_greedy_probs_inplace = False
+
+    sampling_params = SamplingParams(temperature=0)
+
+    # Replace the in-place modifier with a Mock so any call can be detected.
+    mock_inplace = Mock()
+    with patch(
+            "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace",
+            mock_inplace):
+
+        sampler_output = _do_sample(batch_size, fake_logits, sampler,
+                                    sampling_params, device)
+        mock_inplace.assert_not_called()
+
+    assert sampler_output.sampled_token_probs is not None
+    assert sampler_output.logprobs is not None
+    assert sampler_output.sampled_token_ids is not None
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -78,7 +78,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
    """
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler()
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
@@ -111,7 +111,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
    vocab_size = 30_000
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    bonus_token_ids = torch.randint(low=0,
                                    high=vocab_size,
@@ -171,7 +171,7 @@ def test_uniform_target_distribution_accepts_all_tokens(
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(
        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    draft_token_ids = torch.randint(low=0,
                                    high=vocab_size,
@@ -225,7 +225,7 @@ def test_temperature_zero_target_distribution(seed: int,

    typical_acceptance_sampler = get_acceptance_sampler(
        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # Simulate temperature 0 probability distribution for target probabilities
    # and create target probabilities such that only 1 token id has
    # probability 1.0
@@ -285,7 +285,7 @@ def test_mixed_target_distribution(seed: int, disable_bonus_tokens: bool,
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(
        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # For sequences 0 and 2 set the distribution to a temperature
    # zero distribution. For sequences 1 and 3 set it to a uniform
    # distribution.
@@ -352,7 +352,7 @@ def test_accept_tokens_partially(seed: int, disable_bonus_tokens: bool,
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(
        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # Create a temperature zero target probability distribution and ensure
    # all draft token ids correspond to the tokens with 1.0 probability.
    # Verify that all of them are accepted.
@@ -414,7 +414,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(
        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    # Simulate temperature 0 probability distribution for target
    # probabilities and create target probabilities such that only 1 token
    # id has probability 1.0 and others have a very low probability of
@@ -447,7 +447,7 @@ def test_accept_tokens_set_non_default_posteriors(seed: int,
        disable_bonus_tokens=disable_bonus_tokens,
        posterior_threshold=0.0,
        posterior_alpha=0.0)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    output_token_ids = typical_acceptance_sampler(
        target_probs,
        bonus_token_ids,
@@ -485,7 +485,7 @@ def test_replacement_token_ids(seed: int, disable_bonus_tokens: bool,
    torch.set_default_device(device)
    typical_acceptance_sampler = get_acceptance_sampler(
        strict_mode=True, disable_bonus_tokens=disable_bonus_tokens)
-    typical_acceptance_sampler.init_gpu_tensors(rank=0)
+    typical_acceptance_sampler.init_gpu_tensors(device=device)
    target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
    expected_replacement_tokens = -torch.ones(
        (batch_size, k), dtype=torch.long)

--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
 import asyncio
+import os
 from itertools import cycle
 from typing import Dict, List, Optional, Sequence, Tuple, Union

@@ -56,6 +57,11 @@ class AsyncLLM:
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
+
+        # Needed so that engine_use_ray keeps working as a deprecated feature;
+        # otherwise the following constructor will raise an exception.
+        os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
+
        engine_args = AsyncEngineArgs(
            model=model,
            tokenizer=tokenizer,
@@ -282,7 +288,8 @@ def run_greedy_equality_correctness_test(baseline_llm_generator,
                                  ensure_all_accepted=ensure_all_accepted)


-def run_equality_correctness_test(baseline_llm_generator,
+def run_equality_correctness_test(
+        baseline_llm_generator,
        test_llm_generator,
        batch_size,
        max_output_len,
@@ -290,7 +297,8 @@ def run_equality_correctness_test(baseline_llm_generator,
        temperature: float,
        seeded: bool,
        print_tokens: bool = False,
-                                  ensure_all_accepted: bool = False):
+        ensure_all_accepted: bool = False,
+        expected_acceptance_rate: Optional[float] = None):
    """Helper method that compares the outputs of both the baseline LLM and
    the test LLM. It asserts greedy equality, e.g. that the outputs are exactly
    the same when temperature is zero (or when temperature is > 0 and seeded).
@@ -351,5 +359,10 @@ def run_equality_correctness_test(baseline_llm_generator,
        print(f'{i=}     {spec_token_ids=}')
        assert baseline_token_ids == spec_token_ids

+    print(f'{acceptance_rate=}')
+
    if ensure_all_accepted:
        assert acceptance_rate == 1.0
+
+    if expected_acceptance_rate is not None:
+        assert acceptance_rate >= expected_acceptance_rate - 1e-2
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
+"""This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality.
+
+However, we still need to verify that the scenarios below pass:
+    * Batch size 1 greedy equality
+    * Batch size >1 greedy equality
+    * Test greedy equality under preemption
+    * Test greedy equality under various number of speculative tokens.
+
+With those tests, we can at least say that EAGLE does not break the
+correctness of the target model outputs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+# main model
+MAIN_MODEL = "JackFram/llama-68m"
+
+# speculative model
+SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random"
+
+# max. number of speculative tokens: this corresponds to
+# num_heads in the config.json of the speculator model.
+MAX_SPEC_TOKENS = 4
+
+# precision
+PRECISION = "float32"
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+# Speculative-model settings are supplied through test_llm_kwargs only.
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_eagle_e2e_greedy_correctness(baseline_llm_generator,
+                                      test_llm_generator, batch_size: int,
+                                      output_len: int):
+    """Verify greedy equality with different batch sizes.
+
+    Parametrized over batch sizes 1 and 32 with eager execution enforced.
+    The comparison between the baseline generator and the EAGLE test
+    generator is delegated to run_greedy_equality_correctness_test, with
+    the output length forced to output_len.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Do not enforce eager mode, so cuda graphs stay enabled.
+        "enforce_eager": False,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_eagle_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
+                                                 test_llm_generator,
+                                                 batch_size: int,
+                                                 output_len: int):
+    """Verify greedy equality with cuda graph enabled and different
+    batch sizes.
+
+    Same parametrization as the eager-mode greedy test (batch sizes 1 and
+    32, output length 128) except that enforce_eager is False.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # NOTE(review): this small block budget is presumably what forces
+        # preemption with batch_size=4 -- confirm against the scheduler.
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_eagle_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
+                                                      test_llm_generator,
+                                                      batch_size: int,
+                                                      output_len: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+
+    Uses a block size of 8 with the GPU block count and max model length
+    overridden to 2 + 256 // 8 blocks, a batch size of 4 and a forced
+    output length of 128 tokens.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "speculative_model": SPEC_MODEL,
+            "num_speculative_tokens": k,
+        }
+        # Try a range of num. speculative tokens:
+        # k = 1 .. MAX_SPEC_TOKENS (inclusive).
+        for k in range(1, 1 + MAX_SPEC_TOKENS)
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_eagle_different_k(baseline_llm_generator, test_llm_generator,
+                           batch_size: int, output_len: int):
+    """Verify that eagle speculative decoding produces exact equality
+    to without spec decode with different values of num_speculative_tokens.
+
+    One test case is generated per value of num_speculative_tokens from 1
+    up to and including MAX_SPEC_TOKENS.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": SPEC_MODEL,
+                             "num_speculative_tokens": MAX_SPEC_TOKENS,
+                             "speculative_disable_by_batch_size": 4
+                         }])
+# One batch size below and one above speculative_disable_by_batch_size (4).
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_eagle_disable_queue(baseline_llm_generator, test_llm_generator,
+                             batch_size: int, output_len: int):
+    """Verify that eagle speculative decoding produces exact equality
+    to without spec decode when speculation is disabled for large
+    batch sizes.
+
+    speculative_disable_by_batch_size is set to 4 and the test runs with
+    batch sizes 1 and 5.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
+if __name__ == "__main__":
+    # Allow running this file directly; pytest is already imported at
+    # module level, so no local re-import is needed.
+    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -42,3 +42,51 @@ def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
        max_output_len=output_len,
        force_output_len=True,
    )
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-160m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+    {
+        "speculative_model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        # Explicitly specify the draft model quantization.
+        {
+            "speculative_model_quantization": "gptq",
+        },
+        # Explicitly ask the GPTQ-based draft model to use marlin quantization.
+        {
+            "speculative_model_quantization": "marlin",
+        },
+        # Do not explicitly specify the draft model quantization.
+        {
+            "speculative_model_quantization": None,
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("seed", [1])
+def test_speculative_model_quantization_config(baseline_llm_generator,
+                                               test_llm_generator,
+                                               batch_size: int):
+    """Verify greedy equality for each draft model quantization setting
+    ("gptq", "marlin" and None), using a forced output length of 32."""
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=32,
+                                         force_output_len=True)
--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -343,3 +343,78 @@ def run_greedy_logprobs_correctness_test(baseline_llm_generator,
                    b=baseline_rank_to_logprob[rank],
                    abs_tol=1e-1,
                )
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-160m",
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "max_logprobs": 6,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": "JackFram/llama-68m",
+                             "num_speculative_tokens": 3,
+                             "disable_logprobs_during_spec_decoding": True,
+                         }])
+@pytest.mark.parametrize("seed", [1])
+def test_logprobs_disabled(baseline_llm_generator, test_llm_generator):
+    """Check spec-decode output when logprobs are disabled during speculation.
+    The chosen tokens must still match the baseline model's top choices.
+    """
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+        "San Francisco is know for its",
+        "Facebook was created in 2004 by",
+        "Curious George is a",
+        "Python 3.11 brings improvements to its",
+    ]
+
+    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(4))]
+
+    sampling_params = SamplingParams(
+        # Use a smaller output length for a fast test.
+        max_tokens=7,
+        ignore_eos=True,
+        temperature=0.0,
+        logprobs=2,
+    )
+
+    spec_batch_logprobs = get_logprobs_from_llm_generator(
+        test_llm_generator, prompts, sampling_params)
+    baseline_batch_logprobs = get_logprobs_from_llm_generator(
+        baseline_llm_generator, prompts, sampling_params)
+
+    assert len(baseline_batch_logprobs) == len(prompts)
+    assert len(spec_batch_logprobs) == len(prompts)
+
+    # For each sequence in the batch.
+    for _, (baseline_logprobs, spec_logprobs) in enumerate(
+            zip(baseline_batch_logprobs, spec_batch_logprobs)):
+        assert len(spec_logprobs) == len(baseline_logprobs)
+
+        # For each generated position of the sequence.
+        for _, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate(
+                zip(spec_logprobs, baseline_logprobs)):
+            # Spec decode must report a single entry: logprob 0.0, rank -1.
+            assert len(spec_pos_logprobs) == 1
+            spec_top_token_id = list(spec_pos_logprobs)[0]
+
+            spec_top_logprob = spec_pos_logprobs[spec_top_token_id]
+            assert spec_top_logprob.logprob == 0.0
+            assert spec_top_logprob.rank == -1
+
+            # Check that the chosen token is the baseline's rank-1 token.
+            baseline_logprob = baseline_pos_logprobs[spec_top_token_id]
+            assert baseline_logprob.rank == 1
+            assert spec_top_logprob.decoded_token \
+                == baseline_logprob.decoded_token
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -70,8 +70,9 @@ PRECISION = "float32"
 ])
 @pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
-                                    batch_size: int, output_len: int):
+def test_medusa_e2e_greedy_correctness(baseline_llm_generator,
+                                       test_llm_generator, batch_size: int,
+                                       output_len: int):
    """Verify greedy equality with different batch size."""
    run_greedy_equality_correctness_test(baseline_llm_generator,
                                         test_llm_generator,
@@ -80,6 +81,49 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
                                         force_output_len=True)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "enforce_eager": False,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_medusa_e2e_greedy_correctness_cuda_graph(baseline_llm_generator,
+                                                  test_llm_generator,
+                                                  batch_size: int,
+                                                  output_len: int):
+    """Verify greedy equality with cuda graph enabled (enforce_eager=False)
+    and different batch sizes."""
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
@@ -116,7 +160,7 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
    ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
+def test_medusa_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
                                                       test_llm_generator,
                                                       batch_size: int,
                                                       output_len: int):
@@ -165,9 +209,9 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
        32,
    ])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
+def test_medusa_different_k(baseline_llm_generator, test_llm_generator,
                            batch_size: int, output_len: int):
-    """Verify that mlp speculative decoding produces exact equality
+    """Verify that medusa speculative decoding produces exact equality
    to without spec decode with different values of num_speculative_tokens.
    """
    run_greedy_equality_correctness_test(baseline_llm_generator,
@@ -208,9 +252,9 @@ def test_mlp_different_k(baseline_llm_generator, test_llm_generator,
        32,
    ])
 @pytest.mark.parametrize("seed", [1])
-def test_mlp_disable_queue(baseline_llm_generator, test_llm_generator,
+def test_medusa_disable_queue(baseline_llm_generator, test_llm_generator,
                              batch_size: int, output_len: int):
-    """Verify that mlp speculative decoding produces exact equality
+    """Verify that medusa speculative decoding produces exact equality
    to without spec decode when speculation is disabled for large
    batch sizes.
    """

--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -19,8 +19,12 @@ With those tests, we can say at least, MLPSpeculator would not break the
 correctess for the target model outputs.
 """

+from unittest.mock import patch
+
 import pytest

+from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size
+
 from .conftest import (run_equality_correctness_test,
                       run_greedy_equality_correctness_test)

@@ -78,6 +82,48 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
                                         force_output_len=True)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+    },
+])
+@pytest.mark.parametrize("output_len", [2048])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_acceptance_rate(baseline_llm_generator, test_llm_generator,
+                                 batch_size: int, output_len: int):
+    """Verify the acceptance rate (expected >= ~0.48) for different batch
+    sizes with a large output length."""
+    run_equality_correctness_test(baseline_llm_generator,
+                                  test_llm_generator,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  temperature=0.0,
+                                  seeded=True,
+                                  force_output_len=True,
+                                  expected_acceptance_rate=0.48)
+
+
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
@@ -178,6 +224,62 @@ def test_mlp_e2e_greedy_correctness_with_preemption(baseline_llm_generator,
                                         force_output_len=True)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": SPEC_MODEL,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_mlp_e2e_greedy_correctness_with_padding(baseline_llm_generator,
+                                                 test_llm_generator,
+                                                 batch_size: int,
+                                                 output_len: int):
+    """Verify greedy equality when the vocab dimension is padded.
+    """
+
+    # Default pad_to is 64, test model has vocab_size of 32000; force 32064.
+    def patched_pad_vocab_size(vocab_size, pad_to=None):
+        return pad_vocab_size(vocab_size, pad_to=32064)
+
+    with patch(
+            "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size",
+            patched_pad_vocab_size):
+        run_greedy_equality_correctness_test(baseline_llm_generator,
+                                             test_llm_generator,
+                                             batch_size,
+                                             max_output_len=output_len,
+                                             force_output_len=True)
+
+
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{