merge v0.4.3

b9e12416 · zhuwenwen · e5d707db · e9d3aa04 · b9e12416 · b9e12416
Commit b9e12416 authored May 31, 2024 by zhuwenwen
20 changed files
--- a/tests/models/__init__.py
+++ b/tests/models/__init__.py
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -8,11 +8,11 @@ import pytest

 MODELS = [
    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Broken
+    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
    # "Deci/DeciLM-7b",  # Broken
    # "tiiuae/falcon-7b",  # Broken
    "EleutherAI/gpt-j-6b",
-    "mosaicml/mpt-7b",
+    # "mosaicml/mpt-7b",  # Broken
    # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]


--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
+"""Compare the embeddings produced by HF and vLLM for embedding models.
+
+Run `pytest tests/models/test_embedding.py`.
+"""
+import pytest
+import torch
+import torch.nn.functional as F
+
+# Embedding checkpoints whose HF and vLLM outputs are compared in test_models.
+MODELS = [
+    "intfloat/e5-mistral-7b-instruct",
+]
+
+
+def compare_embeddings(embeddings1, embeddings2):
+    """Return the cosine similarity of each pair of embeddings.
+
+    The two sequences are walked in lockstep with ``zip``, so surplus entries
+    in the longer one are ignored. Every element is converted with
+    ``torch.tensor`` and the similarity is taken along ``dim=0``.
+
+    Returns:
+        A list with one similarity value per compared pair.
+    """
+    similarities = [
+        F.cosine_similarity(torch.tensor(e1), torch.tensor(e2), dim=0)
+        for e1, e2 in zip(embeddings1, embeddings2)
+    ]
+    return similarities
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Check that HF and vLLM embeddings of the same prompts nearly coincide.
+
+    Both runners encode ``example_prompts``; the test passes when every
+    per-prompt cosine similarity lies within ``tolerance`` of 1.0.
+    """
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.encode(example_prompts)
+    # Drop the HF model before the vLLM one is created
+    # (presumably to release its memory first -- TODO confirm).
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.encode(example_prompts)
+    del vllm_model
+
+    similarities = compare_embeddings(hf_outputs, vllm_outputs)
+    # Stack the per-prompt values so the bounds are checked in one expression.
+    all_similarities = torch.stack(similarities)
+    tolerance = 1e-2
+    assert torch.all((all_similarities <= 1.0 + tolerance)
+                     & (all_similarities >= 1.0 - tolerance)
+                     ), f"Not all values are within {tolerance} of 1.0"
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -16,31 +16,55 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024

 MODELS = [
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
    "meta-llama/Meta-Llama-3-8B-Instruct",
 ]

 EXPECTED_STRS_MAP = {
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
-        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-        'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya tori, nemuri nemuri)\n\n**'
-    ],
-    "meta-llama/Meta-Llama-3-8B-Instruct": [
-        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-        'In the year 2154, the robotics lab at NeuroSpark Industries was on the cusp of',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
-    ],
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": {
+        "auto": [
+            'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both',
+            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+            'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no'
+        ],
+        "fp8": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system made up of several basic components that work together to enable it to',
+            'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk'
+        ]
+    },
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "auto": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+            'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
+        ],
+        "fp8": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+            'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu'
+        ]
+    },
 }

 capability = torch.cuda.get_device_capability()
@@ -52,14 +76,14 @@ fp8_not_supported = (capability <
 @pytest.mark.skipif(fp8_not_supported,
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
-def test_models(
-    example_prompts,
-    model_name,
-) -> None:
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
    model = LLM(model=model_name,
                max_model_len=MAX_MODEL_LEN,
+                trust_remote_code=True,
                enforce_eager=True,
-                quantization="fp8")
+                quantization="fp8",
+                kv_cache_dtype=kv_cache_dtype)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
@@ -81,8 +105,8 @@ def test_models(
        generations.append(outputs[0].outputs[0].text)
    del model

-    print(generations)
-    expected_strs = EXPECTED_STRS_MAP[model_name]
+    print(model_name, kv_cache_dtype, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
    for i in range(len(example_prompts)):
        generated_str = generations[i]
        expected_str = expected_strs[i]

--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
 """Compares the outputs of gptq vs gptq_marlin 
 Note: GPTQ and Marlin do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
+Marlin/GPTQ models are in the top 5 selections of each other.
 Note: Marlin internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for Marlin. As a result, we re-run the test
 up to 3 times to see if we pass.
-Note: This test currently fails running with --forked with the following:
-    RuntimeError: Cannot re-initialize CUDA in forked subprocess.
-    To use CUDA with multiprocessing, you must use the 'spawn' start method
+
 Run `pytest tests/models/test_gptq_marlin.py`.
 """
 import os
@@ -15,8 +13,10 @@ import os
 import pytest
 import torch

-from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
+
+from .utils import check_logprobs_close

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -49,11 +49,11 @@ MODELS = [
 ]


-@pytest.mark.flaky(reruns=2)
+@pytest.mark.flaky(reruns=3)
 @pytest.mark.skipif(gptq_marlin_not_supported,
                    reason="gptq_marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
@@ -75,17 +75,21 @@ def test_models(
                                    tensor_parallel_size=1)

    gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
+        example_prompts[:-1], max_tokens, num_logprobs)
    del gptq_marlin_model
+    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Run gptq.
+    # The naive gptq kernel doesn't support bf16 yet.
+    # Here we always compare fp16/bf16 gptq marlin kernel
+    # to fp16 gptq kernel.
    gptq_model = vllm_runner(model_name=model_name,
                             revision=revision,
-                             dtype=dtype,
+                             dtype="half",
                             quantization="gptq",
                             max_model_len=MAX_MODEL_LEN,
                             tensor_parallel_size=1)
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
+    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
                                                       max_tokens,
                                                       num_logprobs)
    del gptq_model

--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
+"""Compare the outputs of a GPTQ model to a Marlin_24 model.
+
+Note: GPTQ and Marlin_24 do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 5 selections of each other.
+
+Run `pytest tests/models/test_gptq_marlin_24.py`.
+"""
+from dataclasses import dataclass
+
+import pytest
+import torch
+
+from tests.models.utils import check_logprobs_close
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+# Fold the (major, minor) capability pair into one integer, major * 10 + minor,
+# so it can be compared with the quantization method's minimum capability.
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+# NOTE(review): the threshold is read from the "marlin" entry while the test
+# below loads models with quantization="gptq_marlin_24" -- confirm that both
+# methods share the same minimum capability.
+marlin_not_supported = (capability <
+                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+
+
+@dataclass
+class ModelPair:
+    """Two checkpoint names whose outputs are compared by ``test_models``."""
+    # Checkpoint loaded with quantization="gptq_marlin_24".
+    model_marlin: str
+    # Checkpoint loaded with quantization="gptq".
+    model_gptq: str
+
+
+# Checkpoint pairs fed to test_models through pytest parametrization,
+# one pair per bit-width / group-size combination.
+model_pairs = [
+    # 4-bit, group_size == 128
+    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
+              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
+    # 4-bit, group_size == channelwise
+    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+
+    # 8-bit, group_size == 128
+    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
+              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
+    # 8-bit, group_size == channelwise
+    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+]
+
+
+@pytest.mark.flaky(reruns=2)
+@pytest.mark.skipif(marlin_not_supported,
+                    reason="Marlin24 is not supported on this GPU type.")
+@pytest.mark.parametrize("model_pair", model_pairs)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model_pair: ModelPair,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Compare greedy logprobs of a Marlin_24 checkpoint and its GPTQ twin.
+
+    Both checkpoints of ``model_pair`` generate ``max_tokens`` tokens for
+    ``example_prompts`` with ``num_logprobs`` logprobs requested, and the two
+    result lists are handed to ``check_logprobs_close``.
+    """
+    # Run the Marlin_24 checkpoint first and drop it before loading GPTQ.
+    marlin_24_model = vllm_runner(model_pair.model_marlin,
+                                  dtype=dtype,
+                                  quantization="gptq_marlin_24")
+    marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
+        example_prompts, max_tokens, num_logprobs)
+    del marlin_24_model
+
+    # Run the GPTQ checkpoint with the same prompts and sampling limits.
+    gptq_model = vllm_runner(model_pair.model_gptq,
+                             dtype=dtype,
+                             quantization="gptq")
+    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+    del gptq_model
+
+    check_logprobs_close(
+        outputs_0_lst=gptq_outputs,
+        outputs_1_lst=marlin_24_outputs,
+        name_0="gptq",
+        name_1="marlin_24",
+    )
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -15,9 +15,10 @@ from dataclasses import dataclass
 import pytest
 import torch

-from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS

+from .utils import check_logprobs_close
+
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
 marlin_not_supported = (capability <

--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -4,37 +4,41 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest

+from .utils import check_logprobs_close
+
 MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.3",
 ]


 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.skip(
-    "Two problems: 1. Failing correctness tests. 2. RuntimeError: expected "
-    "scalar type BFloat16 but found Half (only in CI).")
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
    hf_runner,
    vllm_runner,
-    example_long_prompts,
+    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
+    num_logprobs: int,
 ) -> None:
+    # TODO(sang): Sliding window should be tested separately.
    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_long_prompts, max_tokens)
+    hf_outputs = hf_model.generate_greedy_logprobs_limit(
+        example_prompts, max_tokens, num_logprobs)
    del hf_model

    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy(example_long_prompts, max_tokens)
+    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
    del vllm_model
-
-    for i in range(len(example_long_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
+import pytest
+
+from vllm.model_executor.models import _MODELS, ModelRegistry
+
+
+@pytest.mark.parametrize("model_cls", _MODELS)
+def test_registry_imports(model_cls):
+    """Load one registered model class; runs once per entry of ``_MODELS``."""
+    # Ensure all model classes can be imported successfully
+    ModelRegistry.load_model_cls(model_cls)
--- a/tests/prefix_caching/__init__.py
+++ b/tests/prefix_caching/__init__.py
--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
+"""Compare the max model length with and without the sliding window disabled.
+
+Run `pytest tests/prefix_caching/test_disable_sliding_window.py`.
+"""
+import pytest
+
+from tests.conftest import cleanup
+from vllm import LLM
+
+# Each entry is (model, sliding_len, full_len): the max_model_len expected
+# with disable_sliding_window=True, then with disable_sliding_window=False.
+MODEL_LEN_LEN = [
+    # Example models with sliding window.
+    ("bigcode/starcoder2-3b", 4096, 16384),
+    # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI
+
+    # Confirm models without an active sliding window also work.
+    # config has "use_sliding_window": false
+    ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
+    # config has no sliding window attribute.
+    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
+]
+
+
+@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
+def test_disable_sliding_window(model_len_len):
+    """Check ``max_model_len`` with the sliding window disabled and enabled.
+
+    ``model_len_len`` is a ``(model, sliding_len, full_len)`` tuple. The model
+    is loaded twice: with ``disable_sliding_window=True`` the engine must
+    report ``sliding_len`` as ``max_model_len``, and with
+    ``disable_sliding_window=False`` it must report ``full_len``.
+    """
+    model, sliding_len, full_len = model_len_len
+    vllm_disabled_model = LLM(model, disable_sliding_window=True)
+    # Run one short generation so the engine is exercised, not just built.
+    vllm_disabled_model.generate("Hi my name is")
+    model_config = vllm_disabled_model.llm_engine.model_config
+    # Build the message with an f-string: a (template, args...) tuple after
+    # the comma would be printed verbatim instead of being interpolated.
+    assert model_config.max_model_len == sliding_len, (
+        f"Max len expected to equal sliding_len of {sliding_len}, "
+        f"but got {model_config.max_model_len}")
+
+    # Release the first engine before the second one is created.
+    del vllm_disabled_model
+    cleanup()
+
+    vllm_enabled_model = LLM(model, disable_sliding_window=False)
+    vllm_enabled_model.generate("Hi my name is")
+    model_config = vllm_enabled_model.llm_engine.model_config
+    assert model_config.max_model_len == full_len, (
+        f"Max len expected to equal full_len of {full_len}, "
+        f"but got {model_config.max_model_len}")
+
+    del vllm_enabled_model
+    cleanup()
--- a/tests/quantization/__init__.py
+++ b/tests/quantization/__init__.py
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
+"""Test model set-up and weight loading for sparseml-quantized models.
+
+Run `pytest tests/quantization/test_compressed_tensors.py`.
+"""
+
+import torch
+
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+    CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor)
+
+
+def test_compressed_tensors_w8a8_static_setup(vllm_runner):
+    """Load a quantized checkpoint and inspect the linear modules of layer 0.
+
+    The test asserts the quantization method attached to each projection,
+    the scheme of ``qkv_proj``, the weight dtypes, and the presence and dtype
+    of the ``qkv_proj`` scale parameters.
+    """
+    model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
+    llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
+    # Reach through the engine to the model object held by the driver worker.
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+    layer = model.model.layers[0]
+
+    qkv_proj = layer.self_attn.qkv_proj
+    o_proj = layer.self_attn.o_proj
+    gate_up_proj = layer.mlp.gate_up_proj
+    down_proj = layer.mlp.down_proj
+
+    # All four projections must use the compressed-tensors linear method.
+    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+    assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
+    assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
+    assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
+
+    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
+
+    # NOTE(review): down_proj.weight is not checked here, unlike the other
+    # three projections -- confirm whether that omission is intentional.
+    assert qkv_proj.weight.dtype is torch.int8
+    assert o_proj.weight.dtype is torch.int8
+    assert gate_up_proj.weight.dtype is torch.int8
+
+    assert qkv_proj.weight_scale.shard_splitter is not None
+    assert qkv_proj.weight_scale.logical_widths is not None
+    assert qkv_proj.input_scale.dtype is torch.float32
--- a/tests/samplers/__init__.py
+++ b/tests/samplers/__init__.py
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -35,28 +35,25 @@ def test_logits_processor_force_generate(

    # test logits_processors when prompt_logprobs is not None
    vllm_model.model._add_request(
-        prompt=example_prompts[0],
-        sampling_params=params_with_logprobs,
-        prompt_token_ids=None,
+        example_prompts[0],
+        params=params_with_logprobs,
    )

    # test prompt_logprobs is not None
    vllm_model.model._add_request(
-        prompt=example_prompts[1],
-        sampling_params=SamplingParams(
+        example_prompts[1],
+        params=SamplingParams(
            prompt_logprobs=3,
            max_tokens=max_tokens,
        ),
-        prompt_token_ids=None,
    )

    # test grouped requests
    vllm_model.model._add_request(
-        prompt=example_prompts[2],
-        sampling_params=SamplingParams(max_tokens=max_tokens),
-        prompt_token_ids=None,
+        example_prompts[2],
+        params=SamplingParams(max_tokens=max_tokens),
    )

-    outputs = vllm_model.model._run_engine(False)
+    outputs = vllm_model.model._run_engine(use_tqdm=False)

    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
 import pytest
 import torch

-from tests.conftest import VllmRunner
 from vllm import SamplingParams

+from ..conftest import VllmRunner
+
 MODELS = ["facebook/opt-125m"]



--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -42,9 +42,11 @@ def mock_causal_accepted_tensor(
 @pytest.mark.parametrize(
    "which_tokens_accepted",
    ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
+@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_correct_output_format(which_tokens_accepted: str, seed: int,
+def test_correct_output_format(which_tokens_accepted: str,
+                               disable_bonus_tokens: bool, seed: int,
                               device: str):
    """Verify the output has correct format given predetermined accepted matrix.
    """
@@ -82,7 +84,8 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
                                    size=(batch_size, 1),
                                    dtype=torch.int64)

-    rejection_sampler = RejectionSampler()
+    rejection_sampler = RejectionSampler(
+        disable_bonus_tokens=disable_bonus_tokens)
    rejection_sampler.init_gpu_tensors(rank=0)
    output_token_ids = rejection_sampler._create_output(  # pylint: disable=protected-access
        accepted,
@@ -91,9 +94,11 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
        bonus_token_ids,
    )

-    # Bonus tokens are currently disabled. Verify they're set to -1.
+    expected_bonus_token_ids = bonus_token_ids.clone()
+    # If bonus tokens are disabled, verify they are set to -1.
    # See https://github.com/vllm-project/vllm/issues/4212
-    expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1
+    if disable_bonus_tokens:
+        expected_bonus_token_ids = expected_bonus_token_ids * 0 - 1

    if which_tokens_accepted == "all_tokens_accepted":
        # Expect all tokens to be equal to draft tokens.

--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -11,8 +11,7 @@ from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.utils import Counter
-from vllm.worker.model_runner import ModelRunner
+from vllm.utils import Counter, is_pin_memory_available


 class MockLogitsSampler(Sampler):
@@ -26,20 +25,14 @@ class MockLogitsSampler(Sampler):


 def _prepare_test(
-    batch_size: int
-) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
+        batch_size: int
+) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]:
    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
    fake_logits = torch.full((batch_size, VOCAB_SIZE),
                             1e-2,
                             dtype=input_tensor.dtype)
    sampler = MockLogitsSampler(fake_logits)
-    model_runner = ModelRunner(model_config=None,
-                               parallel_config=None,
-                               scheduler_config=None,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None)
-    return input_tensor, fake_logits, sampler, model_runner
+    return input_tensor, fake_logits, sampler


 VOCAB_SIZE = 32000
@@ -53,7 +46,6 @@ def _do_sample(
    batch_size: int,
    input_tensor: torch.Tensor,
    sampler: MockLogitsSampler,
-    model_runner: ModelRunner,
    sampling_params: SamplingParams,
    device: str,
 ):
@@ -75,7 +67,7 @@ def _do_sample(
        seq_lens,
        query_lens=seq_lens,
        device=device,
-        pin_memory=model_runner.pin_memory)
+        pin_memory=is_pin_memory_available())
    return sampler(logits=input_tensor, sampling_metadata=sampling_metadata)


@@ -85,19 +77,16 @@ def test_sampler_all_greedy(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
-        batch_size)
+    input_tensor, fake_logits, sampler = _prepare_test(batch_size)

    sampling_params = SamplingParams(temperature=0)
-    sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
+    sampler_output = _do_sample(batch_size, fake_logits, sampler,
                                sampling_params, device)
    expected = torch.argmax(fake_logits, dim=-1)
    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == expected[i].item()

-    del model_runner
-

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
@@ -105,8 +94,7 @@ def test_sampler_all_random(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
-        batch_size)
+    _, fake_logits, sampler = _prepare_test(batch_size)

    for i in range(batch_size):
        fake_logits[i, i] = 1e2
@@ -115,15 +103,13 @@ def test_sampler_all_random(seed: int, device: str):
        temperature=1.0,
        n=random.randint(1, 10),
    )
-    sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
+    sampler_output = _do_sample(batch_size, fake_logits, sampler,
                                sampling_params, device)

    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == i

-    del model_runner
-

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
@@ -131,7 +117,7 @@ def test_sampler_all_random_seed(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+    _, fake_logits, sampler = _prepare_test(batch_size)

    for i in range(batch_size):
        fake_logits[i, i] = 1e2
@@ -141,15 +127,13 @@ def test_sampler_all_random_seed(seed: int, device: str):
        n=random.randint(1, 10),
        seed=random.randint(0, 10000),
    )
-    sampler_output = _do_sample(batch_size, fake_logits, sampler, model_runner,
+    sampler_output = _do_sample(batch_size, fake_logits, sampler,
                                sampling_params, device)

    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == i

-    del model_runner
-

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
@@ -157,7 +141,7 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+    _, fake_logits, sampler = _prepare_test(batch_size)

    sampling_params = SamplingParams(
        temperature=1.0,
@@ -165,15 +149,13 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str):
        seed=random.randint(0, 10000),
    )
    first_sampler_output = _do_sample(batch_size, fake_logits, sampler,
-                                      model_runner, sampling_params, device)
+                                      sampling_params, device)

    second_sampler_output = _do_sample(batch_size, fake_logits, sampler,
-                                       model_runner, sampling_params, device)
+                                       sampling_params, device)

    assert first_sampler_output == second_sampler_output

-    del model_runner
-

 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
@@ -181,20 +163,18 @@ def test_sampler_all_beam(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+    _, fake_logits, sampler = _prepare_test(batch_size)

    sampling_params = SamplingParams(
        temperature=0,
        best_of=2,
        use_beam_search=True,
    )
-    _do_sample(batch_size, fake_logits, sampler, model_runner, sampling_params,
-               device)
+    _do_sample(batch_size, fake_logits, sampler, sampling_params, device)
    # no assertion here as I am not sure how to determine whether
    # the outputs are expected - in other words, this just tests
    # whether there are no exceptions in the sampler
    # when handling an all-beam search case.
-    del model_runner


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@@ -448,13 +428,13 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
            ("Invalid test case, expected_penalization does not match computed"
             "batch size")

-        _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
+        _, fake_logits, sampler = _prepare_test(batch_size)
        sampling_metadata = SamplingMetadata.prepare(
            seq_group_metadata_list,
            seq_lens=seq_lens if seq_lens else None,
            query_lens=seq_lens if seq_lens else None,
            device=device,
-            pin_memory=model_runner.pin_memory)
+            pin_memory=is_pin_memory_available())
        # the logits tensor is modified in-place by the sampler
        _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

@@ -480,8 +460,6 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
                    fake_logits[logits_idx, :] ==
                    -float('inf')) == 0, "No tokens should have been penalized"

-        del model_runner
-
    for test_case in test_cases:
        run_test_case(**test_case)

@@ -492,8 +470,7 @@ def test_sampler_mixed(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
-        batch_size)
+    input_tensor, fake_logits, sampler = _prepare_test(batch_size)

    seq_group_metadata_list = []
    expected_tokens: List[Optional[List[int]]] = []
@@ -534,13 +511,13 @@ def test_sampler_mixed(seed: int, device: str):
            ))
        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    def test_sampling(model_runner: ModelRunner):
+    def test_sampling():
        sampling_metadata = SamplingMetadata.prepare(
            seq_group_metadata_list,
            seq_lens,
            query_lens=seq_lens,
            device=device,
-            pin_memory=model_runner.pin_memory)
+            pin_memory=is_pin_memory_available())
        sampler_output = sampler(logits=fake_logits,
                                 sampling_metadata=sampling_metadata)

@@ -570,7 +547,7 @@ def test_sampler_mixed(seed: int, device: str):
                    assert nth_output.output_token in expected_tokens[i]

    # Test batch
-    test_sampling(model_runner)
+    test_sampling()

    # Shuffle the batch and resample
    target_index = list(range(batch_size))
@@ -583,9 +560,7 @@ def test_sampler_mixed(seed: int, device: str):

    # This time, results of seeded random samples will be compared with
    # the corresponding sample in the pre-shuffled batch
-    test_sampling(model_runner)
-
-    del model_runner
+    test_sampling()


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@@ -605,12 +580,6 @@ def test_sampler_top_k_top_p(seed: int, device: str):
                               device=input_tensor.device,
                               dtype=input_tensor.dtype)
    sampler = MockLogitsSampler(fake_logits)
-    model_runner = ModelRunner(model_config=None,
-                               parallel_config=None,
-                               scheduler_config=None,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None)

    generation_model = GenerationMixin()
    generation_config = GenerationConfig(top_k=top_k,
@@ -641,7 +610,7 @@ def test_sampler_top_k_top_p(seed: int, device: str):
        seq_lens,
        query_lens=seq_lens,
        device=device,
-        pin_memory=model_runner.pin_memory)
+        pin_memory=is_pin_memory_available())

    sample_probs = None

@@ -657,5 +626,3 @@ def test_sampler_top_k_top_p(seed: int, device: str):
    hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
    assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
    assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
-
-    del model_runner
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -57,11 +57,7 @@ def test_random_sample_with_seed(
                sampling_params_seed_1,
                sampling_params_seed_2,
        ):
-            llm._add_request(
-                prompt=prompt,
-                prompt_token_ids=None,
-                sampling_params=params,
-            )
+            llm._add_request(prompt, params=params)

    results = llm._run_engine(use_tqdm=False)
    all_outputs = [[out.token_ids for out in output.outputs]

--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -6,10 +6,13 @@ from typing import Dict, List, Optional, Tuple, Union
 import pytest
 import ray
 import torch
-from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
-                    nvmlInit)

-from tests.conftest import cleanup
+from vllm.utils import is_hip
+
+if (not is_hip()):
+    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
+                        nvmlInit)
+
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -21,6 +24,8 @@ from vllm.sequence import Logprob, MultiModalData
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, random_uuid

+from ...conftest import cleanup
+

 class AsyncLLM:
    """AsyncLLM
@@ -55,7 +60,7 @@ class AsyncLLM:
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
-        self.engine_args = AsyncEngineArgs(
+        engine_args = AsyncEngineArgs(
            model=model,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
@@ -76,6 +81,8 @@ class AsyncLLM:
            **kwargs,
        )
        self.request_counter = Counter()
+        self.llm_engine = AsyncLLMEngine.from_engine_args(
+            engine_args, usage_context=UsageContext.LLM_CLASS)

    def generate(
        self,
@@ -88,9 +95,6 @@ class AsyncLLM:
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:

-        llm_engine = AsyncLLMEngine.from_engine_args(
-            self.engine_args, usage_context=UsageContext.LLM_CLASS)
-
        if prompts is None:
            raise ValueError("prompts must be provided.")
        if isinstance(prompts, str):
@@ -111,8 +115,8 @@ class AsyncLLM:

        async def get_output(prompt, sampling_param) -> str:
            request_id = random_uuid()
-            results_generator = llm_engine.generate(prompt, sampling_param,
-                                                    request_id)
+            results_generator = self.llm_engine.generate(
+                prompt, sampling_param, request_id)
            final_output = None
            async for request_output in results_generator:
                final_output = request_output
@@ -185,12 +189,25 @@ def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
    return generator_outer


+def maybe_assert_ngram_worker(llm):
+    # Verify the proposer worker is ngram if ngram is specified.
+    if (not isinstance(llm, AsyncLLM)
+            and llm.llm_engine.speculative_config is not None
+            and llm.llm_engine.speculative_config.ngram_prompt_lookup_max > 0):
+        from vllm.spec_decode.ngram_worker import NGramWorker
+        assert isinstance(
+            llm.llm_engine.model_executor.driver_worker.proposer_worker,
+            NGramWorker)
+
+
 def get_output_from_llm_generator(
        llm_generator, prompts,
        sampling_params) -> Tuple[List[str], List[List[int]]]:
    tokens = []
    token_ids = []
    for llm in llm_generator():
+        maybe_assert_ngram_worker(llm)
+
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
        tokens = [output.outputs[0].text for output in outputs]