merge v0.4.1

99b471c2 · zhuwenwen · 1925d2e9 · 468d761b · 99b471c2 · 99b471c2
Commit 99b471c2 authored May 21, 2024 by zhuwenwen
20 changed files
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -10,7 +10,7 @@ from vllm.attention.ops.prefix_prefill import context_attention_fwd
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 8, 64]
-HEAD_SIZES = [128]
+HEAD_SIZES = [128, 96]
 DTYPES = [torch.float16]
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -12,6 +12,7 @@ from huggingface_hub import snapshot_download
 import vllm
 from vllm.config import LoRAConfig
+from vllm.distributed import destroy_model_parallel, initialize_model_parallel
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               MergedColumnParallelLinear,
                                               RowParallelLinear)
@@ -19,8 +20,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
-from vllm.model_executor.parallel_utils.parallel_state import (
-    destroy_model_parallel, initialize_model_parallel)
 def cleanup():
@@ -144,16 +143,27 @@ def baichuan_lora_files():
    return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
+@pytest.fixture(scope="session")
+def baichuan_zero_lora_files():
+    # all the lora_B weights are initialized to zero.
+    return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
+@pytest.fixture(scope="session")
+def tinyllama_lora_files():
+    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings() -> nn.Module:
    cleanup()
    get_model_old = get_model
-    def get_model_patched(model_config, device_config, **kwargs):
+    def get_model_patched(*, model_config, device_config, **kwargs):
-        return get_model_old(model_config,
+        kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
-                             device_config,
+        return get_model_old(model_config=model_config,
-                             lora_config=LoRAConfig(max_loras=4,
+                             device_config=device_config,
-                                                    max_lora_rank=8))
+                             **kwargs)
    with patch("vllm.worker.model_runner.get_model", get_model_patched):
        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)

--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
 @pytest.mark.skip("Requires multiple GPUs")
-def test_llama_tensor_parallel_equality(baichuan_lora_files):
+def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
    # Cannot use as it will initialize torch.cuda too early...
    # if torch.cuda.device_count() < 4:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -170,7 +170,8 @@ def create_random_inputs(
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_embeddings(dist_init, num_loras, device) -> None:
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+def test_embeddings(dist_init, num_loras, device, vocab_size) -> None:
    torch.set_default_device(device)
    max_loras = 8
@@ -179,9 +180,9 @@ def test_embeddings(dist_init, num_loras, device) -> None:
                             lora_dtype=torch.float16)
    def create_random_embedding_layer():
-        embedding = VocabParallelEmbedding(512, 256)
+        embedding = VocabParallelEmbedding(vocab_size, 256)
        embedding.weight.data = torch.rand_like(embedding.weight.data)
-        embedding.weight.data[512:, :] = 0
+        embedding.weight.data[vocab_size:, :] = 0
        lora_embedding = VocabParallelEmbeddingWithLoRA(embedding)
        lora_embedding.create_lora_weights(max_loras, lora_config)
@@ -203,12 +204,13 @@ def test_embeddings(dist_init, num_loras, device) -> None:
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=num_loras * 3,
            input_size=(200, ),
-            input_range=(1, 512),
+            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
-                                       512, lora_config.lora_extra_vocab_size)
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info)
        lora_result = lora_embedding(torch.cat(inputs))
@@ -240,12 +242,13 @@ def test_embeddings(dist_init, num_loras, device) -> None:
            active_lora_ids=[0],
            num_inputs=num_loras * 3,
            input_size=(200, ),
-            input_range=(1, 512),
+            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
-                                       512, lora_config.lora_extra_vocab_size)
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info, )
        lora_result = lora_embedding(torch.cat(inputs))
@@ -263,7 +266,9 @@ def test_embeddings(dist_init, num_loras, device) -> None:
 #     reason="Fails when loras are in any slot other than the first.")
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
+                                        vocab_size) -> None:
    torch.set_default_device(device)
    max_loras = 8
@@ -272,15 +277,15 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
                             lora_dtype=torch.float16)
    def create_random_embedding_layer():
-        embedding = VocabParallelEmbedding(512, 256)
+        embedding = VocabParallelEmbedding(vocab_size, 256)
        embedding_data = torch.rand_like(embedding.weight.data)
        embedding.weight.data = embedding_data
-        embedding.weight.data[512:, :] = 0
+        embedding.weight.data[vocab_size:, :] = 0
        expanded_embedding = VocabParallelEmbedding(
-            512 + lora_config.lora_extra_vocab_size * max_loras,
+            vocab_size + lora_config.lora_extra_vocab_size * max_loras,
            256,
-            org_num_embeddings=512)
+            org_num_embeddings=vocab_size)
-        expanded_embedding.weight.data[:512, :] = embedding_data
+        expanded_embedding.weight.data[:vocab_size, :] = embedding_data
        # We need to deepcopy the embedding as it will be modified
        # in place
        lora_embedding = VocabParallelEmbeddingWithLoRA(
@@ -298,7 +303,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
            id_to_index,
            layer=lora_embedding,
            layer_weights=torch.zeros(
-                (256, 512 + lora_config.lora_extra_vocab_size)),
+                (256, vocab_size + lora_config.lora_extra_vocab_size)),
            generate_embeddings_tensor=256,
        )
@@ -316,7 +321,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
            active_lora_ids=list(lora_dict.keys()),
            num_inputs=num_loras * 3,
            input_size=(200, ),
-            input_range=(1, 512),
+            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -327,16 +332,18 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
        for input_, original_input_, lora_id in zip(inputs, original_inputs,
                                                    prompt_mapping):
            embedding_id = lora_id - 1
-            input_[-1] = 512 + (embedding_id * embeddings_tensor_len)
+            input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len)
-            original_input_[-1] = 512
+            original_input_[-1] = vocab_size
-            input_[-2] = 512 + ((embedding_id + 1) * embeddings_tensor_len - 1)
+            input_[-2] = vocab_size + (
-            original_input_[-2] = 512 + embeddings_tensor_len - 1
+                (embedding_id + 1) * embeddings_tensor_len - 1)
+            original_input_[-2] = vocab_size + embeddings_tensor_len - 1
        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
-                                       512, lora_config.lora_extra_vocab_size)
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info, )
-        expanded_embedding.weight[512:512 +
+        expanded_embedding.weight[vocab_size:vocab_size +
                                  (embeddings_tensor_len *
                                   max_loras)] = torch.cat(embeddings_tensors)
@@ -370,14 +377,15 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
            active_lora_ids=[0],
            num_inputs=num_loras * 3,
            input_size=(200, ),
-            input_range=(1, 512),
+            input_range=(1, vocab_size),
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
        original_inputs = deepcopy(inputs)
        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
-                                       512, lora_config.lora_extra_vocab_size)
+                                       vocab_size,
+                                       lora_config.lora_extra_vocab_size)
        lora_embedding.set_mapping(*mapping_info, )
        lora_result = lora_embedding(torch.cat(original_inputs))
@@ -393,7 +401,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
 @torch.inference_mode()
 @pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
+@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
+def test_lm_head_logits_processor(dist_init, num_loras, device,
+                                  vocab_size) -> None:
    torch.set_default_device(device)
    max_loras = 8
@@ -402,12 +412,14 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
                             lora_dtype=torch.float16)
    def _pretest():
-        linear = ParallelLMHead(32000 + lora_config.lora_extra_vocab_size,
+        linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
-                                1024, 32000)
+                                1024,
+                                vocab_size,
+                                params_dtype=torch.float16)
        linear.weight.data = torch.rand_like(linear.weight.data)
-        linear.weight.data[:, 32000:] = 0
+        linear.weight.data[:, vocab_size:] = 0
        logits_processor = LogitsProcessor(
-            32000 + lora_config.lora_extra_vocab_size, 32000)
+            vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
        lora_logits_processor = LogitsProcessorWithLoRA(
            logits_processor, 1024, linear.weight.dtype, linear.weight.device)
        lora_logits_processor.create_lora_weights(max_loras, lora_config)
@@ -435,7 +447,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
            num_inputs=8 * num_loras,  # * 3,
            input_size=(1, 1024),
            input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -444,7 +456,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
            lora_mapping,
            id_to_index,
            max_loras,
-            32000,
+            vocab_size,
            lora_config.lora_extra_vocab_size,
        )
        lora_logits_processor.set_mapping(*mapping_info, )
@@ -460,7 +472,7 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
                      org_vocab_size:logits_processor.org_vocab_size +
                      embeddings_tensor_len] = embeddings_tensor
-        logits_processor.org_vocab_size = (32000 +
+        logits_processor.org_vocab_size = (vocab_size +
                                           lora_config.lora_extra_vocab_size)
        expected_results = []
        for input_, lora_id in zip(inputs, prompt_mapping):
@@ -468,11 +480,11 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
            result = logits_processor._get_logits(hidden_states=input_,
                                                  embedding=linear.weight,
                                                  embedding_bias=None)
-            result[:, 32000 + embeddings_tensor_len:] = float("-inf")
+            result[:, vocab_size + embeddings_tensor_len:] = float("-inf")
            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
            expected_results.append(result)
        expected_result = torch.cat(expected_results)
-        logits_processor.org_vocab_size = 32000
+        logits_processor.org_vocab_size = vocab_size
        # Check that resetting the lora weights succeeds
@@ -484,19 +496,19 @@ def test_lm_head_logits_processor(dist_init, num_loras, device) -> None:
            num_inputs=8 * num_loras * 3,
            input_size=(1, 1024),
            input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
        mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
-                                       32000,
+                                       vocab_size,
                                       lora_config.lora_extra_vocab_size)
        lora_logits_processor.set_mapping(*mapping_info, )
        lora_result = lora_logits_processor._get_logits(
            hidden_states=torch.cat(inputs),
            embedding=original_weight,
-            embedding_bias=None)[:, :32000]
+            embedding_bias=None)[:, :vocab_size]
        expected_result = logits_processor._get_logits(
            hidden_states=torch.cat(inputs),
            embedding=original_weight,
@@ -523,11 +535,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
    def create_random_linear_parallel_layer():
        if orientation == "row":
-            linear = RowParallelLinear(4096, 4096, bias=False)
+            linear = RowParallelLinear(4096,
+                                       4096,
+                                       bias=False,
+                                       params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = RowParallelLinearWithLoRA(linear)
        else:
-            linear = ColumnParallelLinear(4096, 4096, bias=False)
+            linear = ColumnParallelLinear(4096,
+                                          4096,
+                                          bias=False,
+                                          params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = ColumnParallelLinearWithLoRA(linear)
        lora_linear.create_lora_weights(max_loras, lora_config)
@@ -551,7 +569,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -590,7 +608,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -623,15 +641,24 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
    def create_column_parallel_packed_layer():
        if repeats == 2:
            linear = MergedColumnParallelLinear(4096, [4096] * repeats,
-                                                bias=False)
+                                                bias=False,
+                                                params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = MergedColumnParallelLinearWithLoRA(linear)
        elif repeats == 3:
-            linear = QKVParallelLinear(4096, 64, 32, bias=False)
+            linear = QKVParallelLinear(4096,
+                                       64,
+                                       32,
+                                       bias=False,
+                                       params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = MergedQKVParallelLinearWithLora(linear)
        else:
-            linear = QKVParallelLinear(4096, 64, 32, bias=False)
+            linear = QKVParallelLinear(4096,
+                                       64,
+                                       32,
+                                       bias=False,
+                                       params_dtype=torch.float16)
            linear.weight.data = torch.rand_like(linear.weight.data)
            lora_linear = QKVParallelLinearWithLora(linear)
@@ -666,7 +693,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -706,7 +733,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
            num_inputs=32 * num_loras,
            input_size=(1, 4096),
            input_range=(0, 1),
-            input_type=torch.float32,
+            input_type=torch.float16,
        )
        lora_mapping = LoRAMapping(index_mapping, prompt_mapping)

--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
+import pytest
+from vllm.lora.models import LoRAModel
+from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
+lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
+@pytest.mark.parametrize("lora_name", lora_lst)
+def test_load_checkpoints(
+    lora_name,
+    baichuan_lora_files,
+    baichuan_zero_lora_files,
+    chatglm3_lora_files,
+):
+    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
+    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
+    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
+    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
+    expected_lora_modules = []
+    for module in supported_lora_modules:
+        if module in packed_modules_mapping:
+            expected_lora_modules.extend(packed_modules_mapping[module])
+        else:
+            expected_lora_modules.append(module)
+    if lora_name == "baichuan7B":
+        # For the baichuan7B model, load its LoRA,
+        # and the test should pass.
+        LoRAModel.from_local_checkpoint(
+            baichuan_lora_files,
+            expected_lora_modules,
+            lora_model_id=1,
+            device="cpu",
+            embedding_modules=embedding_modules,
+            embedding_padding_modules=embed_padding_modules)
+    elif lora_name == "baichuan7B-zero":
+        # Test the case where target_modules contain a prefix,
+        # such as "model.layers.0.self_attn.W_pack"; loading should
+        # succeed and the test should pass.
+        LoRAModel.from_local_checkpoint(
+            baichuan_zero_lora_files,
+            expected_lora_modules,
+            lora_model_id=1,
+            device="cpu",
+            embedding_modules=embedding_modules,
+            embedding_padding_modules=embed_padding_modules)
+    else:
+        # For the baichuan7B model, load chatglm3-6b's LoRA,
+        # and the test should raise the following error.
+        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
+        with pytest.raises(ValueError, match=expected_error):
+            LoRAModel.from_local_checkpoint(
+                chatglm3_lora_files,
+                expected_lora_modules,
+                lora_model_id=1,
+                device="cpu",
+                embedding_modules=embedding_modules,
+                embedding_padding_modules=embed_padding_modules)
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -43,10 +43,53 @@ def _lora_ref_impl(
 H1 = H2 = [
-    128, 256, 512, 1024, 1152, 1280, 1536, 2048, 2304, 2560, 2752, 3072, 3456,
+    128,
-    3584, 4096, 4608, 5120, 5504, 5632, 6144, 6848, 6912, 7168, 8192, 9216,
+    256,
-    10240, 11008, 13824, 14336, 22016, 24576, 27392, 32000, 32256, 32512,
+    512,
-    32768, 33024
+    1024,
+    1152,
+    1280,
+    1536,
+    2048,
+    2304,
+    2560,
+    2752,
+    3072,
+    3456,
+    3584,
+    4096,
+    4608,
+    5120,
+    5504,
+    5632,
+    6144,
+    6848,
+    6912,
+    7168,
+    8192,
+    9216,
+    10240,
+    11008,
+    13824,
+    14336,
+    15360,
+    22016,
+    24576,
+    27392,
+    32000,
+    32256,
+    32512,
+    32768,
+    33024,
+    36864,
+    43264,
+    49152,
+    64000,
+    64256,
+    102400,
+    102656,
+    128000,
+    128256,
 ]
 SEED = [0xabcdabcd987]
 CUDA_DEVICES = [

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
+# Adapted from
+# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
+from dataclasses import dataclass
+from typing import List
+import pytest
+import vllm
+from vllm.lora.request import LoRARequest
+from .conftest import cleanup
+@dataclass
+class ModelWithQuantization:
+    model_path: str
+    quantization: str
+MODELS: List[ModelWithQuantization] = [
+    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+                          quantization="AWQ"),
+    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+                          quantization="GPTQ"),
+]
+def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256):
+    raw_prompts = [
+        "Give me an orange-ish brown color",
+        "Give me a neon pink color",
+    ]
+    def format_prompt_tuples(prompt):
+        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+    prompts = [format_prompt_tuples(p) for p in raw_prompts]
+    sampling_params = vllm.SamplingParams(temperature=0,
+                                          max_tokens=max_tokens,
+                                          stop=["<|im_end|>"])
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("tp_size", [1])
+def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
+    # Cannot use the check below as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < tp_size:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+    llm = vllm.LLM(model=model.model_path,
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=4,
+                   max_model_len=400,
+                   tensor_parallel_size=tp_size,
+                   quantization=model.quantization,
+                   trust_remote_code=True)
+    if model.quantization is None:
+        expected_no_lora_output = [
+            "Here are some examples of orange-brown colors",
+            "I'm sorry, I don't have"
+        ]
+        expected_lora_output = [
+            "#ff8050",
+            "#ff8080",
+        ]
+    elif model.quantization == "AWQ":
+        expected_no_lora_output = [
+            "I'm sorry, I don't understand",
+            "I'm sorry, I don't understand",
+        ]
+        expected_lora_output = [
+            "#f07700: A v",
+            "#f00000: A v",
+        ]
+    elif model.quantization == "GPTQ":
+        expected_no_lora_output = [
+            "I'm sorry, I don't have",
+            "I'm sorry, I don't have",
+        ]
+        expected_lora_output = [
+            "#f08800: This is",
+            "#f07788 \n#",
+        ]
+    def expect_match(output, expected_output):
+        # HACK: GPTQ lora outputs are just incredibly unstable.
+        # Assert that the outputs changed.
+        if (model.quantization == "GPTQ"
+                and expected_output is expected_lora_output):
+            assert output != expected_no_lora_output
+            for i, o in enumerate(output):
+                assert o.startswith(
+                    '#'), f"Expected example {i} to start with # but got {o}"
+            return
+        assert output == expected_output
+    max_tokens = 10
+    print("lora adapter created")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=0,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_no_lora_output)
+    print("lora 1")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=1,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_lora_output)
+    print("no lora")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=0,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_no_lora_output)
+    print("lora 2")
+    output = do_sample(llm,
+                       tinyllama_lora_files,
+                       lora_id=2,
+                       max_tokens=max_tokens)
+    expect_match(output, expected_lora_output)
+    print("removing lora")
+    del llm
+    cleanup()
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.skip("Requires multiple GPUs")
+def test_quant_model_tp_equality(tinyllama_lora_files, model):
+    # Cannot use the check below as it will initialize torch.cuda too early...
+    # if torch.cuda.device_count() < 2:
+    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
+    llm_tp1 = vllm.LLM(model=model.model_path,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=1,
+                       quantization=model.quantization,
+                       trust_remote_code=True)
+    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
+    del llm_tp1
+    cleanup()
+    llm_tp2 = vllm.LLM(model=model.model_path,
+                       enable_lora=True,
+                       max_num_seqs=16,
+                       max_loras=4,
+                       tensor_parallel_size=2,
+                       quantization=model.quantization)
+    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
+    del llm_tp2
+    cleanup()
+    assert output_tp1 == output_tp2
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -3,8 +3,8 @@ import random
 import tempfile
 from unittest.mock import patch
-from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         SchedulerConfig)
+                         ModelConfig, ParallelConfig, SchedulerConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.worker.worker import Worker
@@ -18,15 +18,21 @@ def test_worker_apply_lora(sql_lora_files):
            "meta-llama/Llama-2-7b-hf",
            tokenizer_mode="auto",
            trust_remote_code=False,
-            download_dir=None,
-            load_format="dummy",
            seed=0,
            dtype="float16",
            revision=None,
        ),
+        load_config=LoadConfig(
+            download_dir=None,
+            load_format="dummy",
+        ),
        parallel_config=ParallelConfig(1, 1, False),
        scheduler_config=SchedulerConfig(32, 32, 32),
        device_config=DeviceConfig("cuda"),
+        cache_config=CacheConfig(block_size=16,
+                                 gpu_memory_utilization=1.,
+                                 swap_space=0,
+                                 cache_dtype="auto"),
        local_rank=0,
        rank=0,
        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,

--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
+import os
+import huggingface_hub.constants
+import pytest
+from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer
+def test_hf_transfer_auto_activation():
+    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
+        # in case it is already set, we can't test the auto activation
+        pytest.skip(
+            "HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
+    enable_hf_transfer()
+    try:
+        # enable hf hub transfer if available
+        import hf_transfer  # type: ignore # noqa
+        HF_TRANFER_ACTIVE = True
+    except ImportError:
+        HF_TRANFER_ACTIVE = False
+    assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER ==
+            HF_TRANFER_ACTIVE)
+if __name__ == "__main__":
+    test_hf_transfer_auto_activation()
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
+"""Compare the outputs of an AQLM model between vLLM and HF Transformers
+Run `pytest tests/models/test_aqlm.py`.
+"""
+import pytest
+import torch
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+aqlm_not_supported = (capability <
+                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
+# In this test we hardcode prompts and generations for the model so we don't
+# need to require the AQLM package as a dependency
+example_prompts = [
+    'vLLM is a high-throughput and memory-efficient inference and serving '
+    'engine for LLMs.\n',
+    'Briefly describe the major milestones in the development of artificial '
+    'intelligence from 1950 to 2020.\n',
+    'Compare and contrast artificial intelligence with human intelligence in '
+    'terms of processing information.\n',
+    'Describe the basic components of a neural network and how it can be '
+    'trained.\n',
+    'Write a short story about a robot that dreams for the first time.\n',
+    'Analyze the impact of the COVID-19 pandemic on global economic structures '
+    'and future business models.\n',
+    'Explain the cultural significance of the Mona Lisa painting, and how its '
+    'perception might vary in Western versus Eastern societies.\n',
+    "Translate the following English sentence into Japanese, French, and "
+    "Swahili: 'The early bird catches the worm.'\n"
+]
+# These ground truth generations were generated using `transformers==4.38.1
+# aqlm==1.1.0 torch==2.2.0`
+# and the below code:
+# ```python
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
+# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
+# torch_dtype="auto", device_map="cuda").cuda()
+# tokenizer = AutoTokenizer.from_pretrained(model_id)
+# outputs = []
+# for prompt in example_prompts:
+#     input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
+#     hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
+#     outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
+# print(outputs)
+# ```
+ground_truth_generations = [
+    '\n### Features\n\n- **High-throughput**: v',
+    'The major milestones in the development of artificial intelligence from '
+    '195',
+    'Compare and contrast artificial intelligence with human intelligence in '
+    'terms of processing information. The',
+    'Explain the difference between supervised and unsupervised learning.'
+    '\nExplain',
+    'Write a short story about a robot that dreams for the first time. The',
+    'Analyze the impact of the COVID-19 pandemic on global economic',
+    'The Mona Lisa is a painting by Leonardo da Vinci, and it',
+    'The early bird catches the worm.\nThe early bird catches the'
+]
+@pytest.mark.skipif(aqlm_not_supported,
+                    reason="AQLM is not supported on this GPU type.")
+@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [16])
+@pytest.mark.parametrize("num_logprobs", [1])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+                                                       max_tokens,
+                                                       num_logprobs)
+    # loop through the prompts to compare against the ground truth generations
+    for prompt_idx in range(len(example_prompts)):
+        vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[
+            prompt_idx]
+        print("Prompt:          ", repr(example_prompts[prompt_idx]))
+        print("Reference output:", repr(ground_truth_generations[prompt_idx]))
+        print("Output output:   ", repr(vllm_output_str))
+        assert vllm_output_str == ground_truth_generations[prompt_idx]
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -16,13 +16,12 @@ from dataclasses import dataclass
 import pytest
 import torch
-from vllm.model_executor.layers.quantization import (
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-    _QUANTIZATION_CONFIG_REGISTRY)
 capability = torch.cuda.get_device_capability()
 capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (
+marlin_not_supported = (capability <
-    capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability())
+                        QUANTIZATION_METHODS["marlin"].get_min_capability())
 @dataclass
@@ -47,7 +46,7 @@ model_pairs = [
 @pytest.mark.parametrize("model_pair", model_pairs)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [3])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
    vllm_runner,
    example_prompts,

--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -12,7 +12,7 @@ MODELS = [
    "gpt2",
    "bigcode/tiny_starcoder_py",
    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",
+    "bigscience/bloom-560m",  # Testing alibi slopes.
    "microsoft/phi-2",
    "stabilityai/stablelm-3b-4e1t",
    # "allenai/OLMo-1B",  # Broken

--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
+import torch
+from vllm import LLM, ModelRegistry, SamplingParams
+from vllm.model_executor.models.opt import OPTForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+class MyOPTForCausalLM(OPTForCausalLM):
+    """OPT variant whose logits always favor token id 0."""
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        """Return the parent's logits tensor overwritten in place.
+
+        Every entry is zeroed and column 0 is then set to 1.0, so token 0
+        has the highest score in every row.
+        """
+        overridden = super().compute_logits(hidden_states, sampling_metadata)
+        overridden.zero_()
+        # After zeroing, assigning 1.0 equals the original "+= 1.0".
+        overridden[:, 0] = 1.0
+        return overridden
+def test_oot_registration():
+    """Check that a model class registered under an existing name is used.
+
+    MyOPTForCausalLM is registered as "OPTForCausalLM"; the generated text
+    must then consist solely of repetitions of the decoded token 0.
+    """
+    # Replace the architecture entry with the dummy subclass.
+    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
+    llm = LLM(model="facebook/opt-125m")
+    token_zero_text = llm.get_tokenizer().decode(0)
+    greedy_params = SamplingParams(temperature=0)
+    results = llm.generate(
+        ["Hello, my name is", "The text does not matter"], greedy_params)
+    for result in results:
+        # Removing every occurrence of token 0's text must leave nothing.
+        leftover = result.outputs[0].text.replace(token_zero_text, "")
+        assert leftover == ""
--- a/tests/quantization/test_autogptq_marlin_configs.py
+++ b/tests/quantization/test_autogptq_marlin_configs.py
+"""Tests whether Marlin models can be loaded from the autogptq config.
+Run `pytest tests/quantization/test_autogptq_marlin_configs.py --forked`.
+"""
+from dataclasses import dataclass
+import pytest
+from vllm.config import ModelConfig
+# Pair of model identifiers, one per quantization format.
+# NOTE(review): not referenced by the visible part of this file - confirm
+# whether it is still needed.
+@dataclass
+class ModelPair:
+    # presumably the Marlin-format checkpoint id - TODO confirm
+    model_marlin: str
+    # presumably the GPTQ-format checkpoint id - TODO confirm
+    model_gptq: str
+# Parametrization table for test_auto_gptq.
+# Each entry is (model id, quantization method ModelConfig is expected to
+# report for it).
+MODELS_QUANT_TYPE = [
+    # compat: autogptq <=0.7.1 is_marlin_format: bool
+    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin"),
+    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq"),
+    # compat: autogptq >=0.8.0 use checkpoint_format: str
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin"),
+    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq")
+]
+@pytest.mark.parametrize("model_quant_type", MODELS_QUANT_TYPE)
+def test_auto_gptq(model_quant_type: str, ) -> None:
+    """Check which quantization method ModelConfig reports for a checkpoint.
+
+    model_quant_type is unpacked as a (model path, expected quantization)
+    pair from MODELS_QUANT_TYPE. The config is built twice, once with
+    quantization=None and once with quantization="gptq"; both must report
+    the expected method.
+    """
+    model_path, quant_type = model_quant_type
+    # Keyword arguments common to both ModelConfig constructions.
+    shared_kwargs = dict(
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        revision=None,
+    )
+    # case 1: no quantization method passed
+    cfg_unspecified = ModelConfig(model_path,
+                                  model_path,
+                                  quantization=None,
+                                  **shared_kwargs)
+    # case 2: gptq passed explicitly
+    cfg_gptq = ModelConfig(model_path,
+                           model_path,
+                           quantization="gptq",
+                           **shared_kwargs)
+    assert cfg_unspecified.quantization == quant_type, (
+        f"Expected quant_type == {quant_type} for {model_path}, "
+        f"but found {cfg_unspecified.quantization} "
+        "for no --quantization None case")
+    assert cfg_gptq.quantization == quant_type, (
+        f"Expected quant_type == {quant_type} for {model_path}, "
+        f"but found {cfg_gptq.quantization} "
+        "for --quantization gptq case")
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
+"""Tests whether FP8 computation is enabled correctly.
+Run `pytest tests/quantization/test_fp8.py --forked`.
+"""
+import pytest
+import torch
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+# Fold the two-element device capability into one integer
+# (first * 10 + second) so it can be compared with get_min_capability().
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+    reason="FP8 is not supported on this GPU type.")
+def test_load_fp16_model(vllm_runner) -> None:
+    """Load facebook/opt-125m with quantization="fp8" and inspect one layer.
+
+    The first decoder layer's fc1 must use Fp8LinearMethod and hold its
+    weight as torch.float8_e4m3fn.
+    """
+    runner = vllm_runner("facebook/opt-125m", quantization="fp8")
+    engine = runner.model.llm_engine
+    loaded_model = engine.model_executor.driver_worker.model_runner.model
+    first_fc1 = loaded_model.model.decoder.layers[0].fc1
+    assert isinstance(first_fc1.linear_method, Fp8LinearMethod)
+    assert first_fc1.weight.dtype == torch.float8_e4m3fn
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
+import pytest
+import torch
+from vllm import SamplingParams
+# Model ids used to parametrize the "model" argument of the test below.
+MODELS = ["facebook/opt-125m"]
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_logits_processor_force_generate(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    """Force a fixed answer through a logits processor in a mixed batch.
+
+    Three requests are queued: one with the forcing processor plus
+    prompt_logprobs, one with prompt_logprobs only, and one with neither.
+    Only the first request's text is checked; it must be the enforced answer
+    repeated repeat_times times.
+    """
+    vllm_model = vllm_runner(model, dtype=dtype)
+    engine = vllm_model.model
+    repeat_times = 2
+    enforced_answers = " vLLM"
+    forced_ids = engine.get_tokenizer().encode(enforced_answers,
+                                               add_special_tokens=False)
+    num_forced = len(forced_ids)
+    max_tokens = num_forced * repeat_times
+    def pick_vllm(token_ids, logits):
+        # Cycle through the forced ids based on how many tokens exist so far
+        # and give the chosen id the largest finite value of the dtype.
+        forced_id = forced_ids[len(token_ids) % num_forced]
+        logits[forced_id] = torch.finfo(logits.dtype).max
+        return logits
+    queued_requests = [
+        # logits_processors together with prompt_logprobs
+        (example_prompts[0],
+         SamplingParams(
+             logits_processors=[pick_vllm],
+             prompt_logprobs=3,
+             max_tokens=max_tokens,
+         )),
+        # prompt_logprobs without a logits processor
+        (example_prompts[1],
+         SamplingParams(
+             prompt_logprobs=3,
+             max_tokens=max_tokens,
+         )),
+        # grouped requests
+        (example_prompts[2], SamplingParams(max_tokens=max_tokens)),
+    ]
+    for prompt, params in queued_requests:
+        engine._add_request(
+            prompt=prompt,
+            sampling_params=params,
+            prompt_token_ids=None,
+        )
+    outputs = engine._run_engine(False)
+    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -91,12 +91,16 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
        bonus_token_ids,
    )
+    # Bonus tokens are currently disabled. Verify they're set to -1.
+    # See https://github.com/vllm-project/vllm/issues/4212
+    expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1
    if which_tokens_accepted == "all_tokens_accepted":
        # Expect all tokens to be equal to draft tokens.
        assert torch.equal(output_token_ids[:, :-1], draft_token_ids)
        # Expect all bonus tokens to be included.
-        assert torch.equal(output_token_ids[:, -1:], bonus_token_ids)
+        assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids)
    elif which_tokens_accepted == "no_tokens_accepted":
        # Expect first token to be equal to recovered tokens.
        assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0])
@@ -106,7 +110,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
                           torch.ones_like(output_token_ids[:, 1:]) * -1)
    elif which_tokens_accepted == "some_tokens_accepted":
        recovered_plus_bonus = torch.cat(
-            (recovered_token_ids, bonus_token_ids), dim=-1)
+            (recovered_token_ids, expected_bonus_token_ids), dim=-1)
        # Assert first rejected token is a recovered token or bonus token.
        assert torch.equal(
            recovered_plus_bonus[torch.arange(0, batch_size),

--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
+import itertools
 import random
 from typing import List, Optional, Tuple
 from unittest.mock import patch
@@ -31,7 +32,12 @@ def _prepare_test(
                             1e-2,
                             dtype=input_tensor.dtype)
    sampler = MockLogitsSampler(fake_logits)
-    model_runner = ModelRunner(None, None, None, None, None)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
    return input_tensor, fake_logits, sampler, model_runner
@@ -194,11 +200,15 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
    def create_sampling_params(min_tokens,
                               eos_token_id=0,
-                               stop_token_ids=None):
+                               *,
+                               stop_token_ids: Optional[List[str]] = None,
+                               prompt_logprobs: Optional[int] = None):
        sampling_params = SamplingParams(
            min_tokens=min_tokens,
            max_tokens=9999,  # keep higher than max of min_tokens
            stop_token_ids=stop_token_ids,
+            # requesting prompt_logprobs changes the structure of `logits`
+            prompt_logprobs=prompt_logprobs,
        )
        sampling_params.eos_token_id = eos_token_id
        return sampling_params
@@ -217,9 +227,9 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
        expected_penalization = []
        sequence_metadata_list = []
+        # 20% chance to generate seq group metadata list with all prompts
+        is_prompt = random.random() < 0.2
        while batch_size > 0:
-            # 20% chance to generate prompt seq group with single sequence
-            is_prompt = random.random() < 0.2
            num_seqs = 1 if is_prompt else random.randint(1, batch_size)
            eos_token_id = random.randint(0, VOCAB_SIZE - 1)
@@ -240,7 +250,7 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
            seq_group_penalization = []
            for _ in range(num_seqs):
                num_input = random.randint(1, 100)
-                num_generated = random.randint(1, 100) if not is_prompt else 0
+                num_generated = 0 if is_prompt else random.randint(1, 100)
                seq_data[next(seq_id_counter)] = create_sequence_data(
                    num_input=num_input, num_generated=num_generated)
                seq_group_penalization.append(num_generated < min_tokens)
@@ -292,6 +302,21 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
        ]
    }
+    prompt_with_penalization_and_prompt_logprobs = {
+        "expected_penalization": [False, False, True],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_1",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(num_input=3),
+                },
+                sampling_params=create_sampling_params(1, prompt_logprobs=3),
+                block_tables={},
+            ),
+        ]
+    }
    stop_penalizing_after_min_tokens = {
        "expected_penalization": [False],
        "seq_group_metadata_list": [
@@ -309,8 +334,34 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
    }
    stop_token_ids = [42, 99, 42, 0]  # intentional duplication
-    simple_combination = {
+    prompt_combination = {
-        "expected_penalization": [True, False, False],
+        "expected_penalization": [False, True, False],
+        "seq_group_metadata_list": [
+            SequenceGroupMetadata(
+                request_id="test_2",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(num_input=2),
+                },
+                sampling_params=create_sampling_params(1, prompt_logprobs=3),
+                block_tables={},
+            ),
+            SequenceGroupMetadata(
+                request_id="test_3",
+                is_prompt=True,
+                seq_data={
+                    next(seq_id_counter): create_sequence_data(),
+                },
+                sampling_params=create_sampling_params(
+                    0, stop_token_ids=stop_token_ids),
+                block_tables={},
+            )
+        ]
+    }
+    stop_token_ids = [1, 999, 37, 37]  # intentional duplication
+    decode_combination = {
+        "expected_penalization": [True, False, False, True, False],
        "seq_group_metadata_list": [
            SequenceGroupMetadata(
                request_id="test_1",
@@ -327,14 +378,19 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
            ),
            SequenceGroupMetadata(
                request_id="test_2",
-                is_prompt=True,
+                is_prompt=False,
                seq_data={
-                    next(seq_id_counter): create_sequence_data(),
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=20),
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=1),
+                    next(seq_id_counter):
+                    create_sequence_data(num_generated=10),
                },
                sampling_params=create_sampling_params(
-                    0, stop_token_ids=stop_token_ids),
+                    10, prompt_logprobs=5, stop_token_ids=stop_token_ids),
                block_tables={},
-            )
+            ),
        ]
    }
@@ -342,8 +398,10 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
        test_cases = [
            prompt_without_penalization,
            prompt_with_penalization,
+            prompt_with_penalization_and_prompt_logprobs,
            stop_penalizing_after_min_tokens,
-            simple_combination,
+            prompt_combination,
+            decode_combination,
        ]
    else:
        test_cases = [generate_test_case()]
@@ -351,30 +409,49 @@ def test_sampler_min_tokens_penalty(seed: int, device: str):
    def run_test_case(*,
                      expected_penalization=None,
                      seq_group_metadata_list=None):
-        assert expected_penalization, "Invalid test case"
+        assert expected_penalization, \
-        assert seq_group_metadata_list, "Invalid test case"
+            "Invalid test case, need expected_penalization"
+        assert seq_group_metadata_list, \
+            "Invalid test case, need seq_group_metadata_list"
        batch_size = 0
        prompt_lens = []
-        sampling_params_per_seq = []
+        sampling_params_per_row = []
        for sgm in seq_group_metadata_list:
-            num_seqs = len(sgm.seq_data)
-            batch_size += num_seqs
            sampling_params = sgm.sampling_params
-            for seq_id in sgm.seq_data:
-                prompt_lens.append(sgm.seq_data[seq_id].get_prompt_len())
+            num_rows = len(sgm.seq_data)
-                sampling_params_per_seq.append(sampling_params)
+            if sgm.is_prompt:
+                # a prompt seq_group has only one sequence
+                seq_data = next(iter(sgm.seq_data.values()))
+                prompt_len = seq_data.get_prompt_len()
+                prompt_lens.append(prompt_len)
+                if sgm.sampling_params.prompt_logprobs:
+                    # with prompt_logprobs each token in the prompt has a row in
+                    # logits
+                    num_rows = prompt_len
+            batch_size += num_rows
+            sampling_params_per_row.extend(
+                itertools.repeat(sampling_params, num_rows))
+        assert len(
+            expected_penalization
+        ) == batch_size, \
+            ("Invalid test case, expected_penalization does not match computed"
+             "batch size")
        _, fake_logits, sampler, model_runner = _prepare_test(batch_size)
        sampling_metadata = model_runner._prepare_sample(
            seq_group_metadata_list,
-            prompt_lens=prompt_lens,
+            prompt_lens=prompt_lens if prompt_lens else None,
-            subquery_lens=prompt_lens)
+            subquery_lens=prompt_lens if prompt_lens else None)
        # the logits tensor is modified in-place by the sampler
        _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
        for logits_idx, (should_penalize, sampling_params) in enumerate(
-                zip(expected_penalization, sampling_params_per_seq)):
+                zip(expected_penalization, sampling_params_per_row)):
            tokens_to_check = [sampling_params.eos_token_id]
            if sampling_params.stop_token_ids:
@@ -519,7 +596,12 @@ def test_sampler_top_k_top_p(seed: int, device: str):
                               device=input_tensor.device,
                               dtype=input_tensor.dtype)
    sampler = MockLogitsSampler(fake_logits)
-    model_runner = ModelRunner(None, None, None, None, None)
+    model_runner = ModelRunner(model_config=None,
+                               parallel_config=None,
+                               scheduler_config=None,
+                               device_config=None,
+                               load_config=None,
+                               lora_config=None)
    generation_model = GenerationMixin()
    generation_config = GenerationConfig(top_k=top_k,
@@ -554,7 +636,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
    def mock_sample(probs, *args, **kwargs):
        nonlocal sample_probs
        sample_probs = probs
-        return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs]
+        return ([[prob.topk(1, dim=-1).indices.tolist(), [0]]
+                 for prob in probs], None)
    with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
        sampler(logits=fake_logits, sampling_metadata=sampling_metadata)

--- a/vllm/model_executor/parallel_utils/__init__.py
+++ b/vllm/model_executor/parallel_utils/__init__.py
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
+from typing import List, Tuple
+import pytest
+from tests.conftest import cleanup
+from vllm import LLM
+from vllm.model_executor.utils import set_random_seed
+@pytest.fixture
+def baseline_llm_generator(request, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           seed):
+    """Return the LLM generator factory for the "baseline" configuration.
+
+    baseline_llm_kwargs is passed as the distinct kwargs that
+    create_llm_generator merges last.
+    """
+    return create_llm_generator(
+        baseline_or_test="baseline",
+        request=request,
+        common_llm_kwargs=common_llm_kwargs,
+        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
+        distinct_llm_kwargs=baseline_llm_kwargs,
+        seed=seed,
+    )
+@pytest.fixture
+def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
+                       test_llm_kwargs, seed):
+    """Return the LLM generator factory for the "test" configuration.
+
+    test_llm_kwargs is passed as the distinct kwargs that
+    create_llm_generator merges last.
+    """
+    return create_llm_generator(
+        baseline_or_test="test",
+        request=request,
+        common_llm_kwargs=common_llm_kwargs,
+        per_test_common_llm_kwargs=per_test_common_llm_kwargs,
+        distinct_llm_kwargs=test_llm_kwargs,
+        seed=seed,
+    )
+def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
+                         per_test_common_llm_kwargs, distinct_llm_kwargs,
+                         seed):
+    """Return a zero-argument generator function that lazily builds an LLM.
+
+    The three kwargs dicts are merged, with later dicts overriding earlier
+    ones: common, then per-test common, then distinct. Calling the returned
+    function gives a generator that yields a single LLM built from the
+    merged kwargs. The LLM is deleted and cleanup() is called when the
+    generator is resumed, closed, or garbage collected.
+
+    baseline_or_test and request.node.name are only used in the log line.
+    """
+    kwargs = {
+        **common_llm_kwargs,
+        **per_test_common_llm_kwargs,
+        **distinct_llm_kwargs,
+    }
+    test_name = request.node.name
+    def generator_inner():
+        print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
+        llm = LLM(**kwargs)
+        try:
+            set_random_seed(seed)
+            yield llm
+        finally:
+            # Also runs when the consumer raised or stopped iterating early,
+            # so the LLM is released and does not leak into later tests.
+            del llm
+            cleanup()
+    def generator_outer():
+        # Delegating keeps no extra reference to the LLM in this frame and
+        # forwards close() to generator_inner.
+        yield from generator_inner()
+    return generator_outer
+def get_output_from_llm_generator(
+        llm_generator, prompts,
+        sampling_params) -> Tuple[List[str], List[List[int]]]:
+    """Run the prompts through every LLM yielded by llm_generator().
+
+    Returns (texts, token id lists) taken from the first completion of each
+    request output. Results from a later LLM replace those of an earlier
+    one; both lists stay empty if nothing is yielded.
+    """
+    tokens, token_ids = [], []
+    for llm in llm_generator():
+        request_outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
+        first_completions = [item.outputs[0] for item in request_outputs]
+        token_ids = [completion.token_ids for completion in first_completions]
+        tokens = [completion.text for completion in first_completions]
+        # Drop this frame's reference before the generator is resumed.
+        del llm
+    return tokens, token_ids