Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0
-    print("My rank: %d, device: %s" % (my_rank, device))
+    print(f"My rank: {my_rank}, device: {device}")
    # insert
    tokens = torch.tensor([1, 2, 3]).to(device)
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0
-    print("My rank: %d, Test run passed!" % (my_rank))
+    print(f"My rank: {my_rank}, Test run passed!")
 def stress_test(my_rank, buf, device):
@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
                assert torch.allclose(k, k_)
                assert torch.allclose(v, v_)
                assert torch.allclose(h, h_)
-    print('Rank %d done' % my_rank)
+    print(f"Rank {my_rank} done")
    torch.distributed.barrier()
    if my_rank == 0:
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
    else:
        torch.distributed.send(torch.tensor([n]), 0)
-    print("My rank: %d, Passed stress test!" % (my_rank))
+    print(f"My rank: {my_rank}, Passed stress test!")
 if __name__ == "__main__":
@@ -122,7 +122,7 @@ if __name__ == "__main__":
        rank=my_rank,
    )
-    print("initialized! My rank is %d" % my_rank)
+    print(f"initialized! My rank is {my_rank}")
    config = KVTransferConfig(
        kv_connector='PyNcclConnector',

--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -22,13 +22,13 @@ def test_run(my_rank, pipe):
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
    else:
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
-        print(f"rank {my_rank} received y2 = ", x2)
+        print(f"rank {my_rank} received y2 = ", y2)
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch
 import pytest
 import os
+import safetensors
 import torch
 import torch.nn as nn
@@ -22,6 +23,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
+from vllm.platforms import current_platform
 from ..utils import models_path_prefix
@@ -67,13 +69,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
 @pytest.fixture
 def dist_init():
    temp_file = tempfile.mkstemp()[1]
-    init_distributed_environment(
-        world_size=1,
+    backend = "nccl"
-        rank=0,
+    if current_platform.is_cpu():
-        distributed_init_method=f"file://{temp_file}",
+        backend = "gloo"
-        local_rank=0,
-        backend="nccl",
+    init_distributed_environment(world_size=1,
-    )
+                                 rank=0,
+                                 distributed_init_method=f"file://{temp_file}",
+                                 local_rank=0,
+                                 backend=backend)
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -83,13 +88,15 @@ def dist_init():
 def dist_init_torch_only():
    if torch.distributed.is_initialized():
        return
+    backend = "nccl"
+    if current_platform.is_cpu():
+        backend = "gloo"
    temp_file = tempfile.mkstemp()[1]
-    torch.distributed.init_process_group(
+    torch.distributed.init_process_group(world_size=1,
-        backend="nccl",
+                                         rank=0,
-        world_size=1,
+                                         init_method=f"file://{temp_file}",
-        rank=0,
+                                         backend=backend)
-        init_method=f"file://{temp_file}",
-    )
 @pytest.fixture
@@ -173,6 +180,29 @@ def mixtral_lora_files_all_target_modules():
    return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
+@pytest.fixture(scope="session")
+def jamba_lora_files():
+    #   some of the adapters have unnecessary weights for serving,
+    #   hence we remove them
+    def remove_unnecessary_weights(path):
+        lora_path = f"{adapter_path}/adapter_model.safetensors"
+        tensors = safetensors.torch.load_file(lora_path)
+        nonlora_keys = []
+        for k in list(tensors.keys()):
+            if "lora" not in k:
+                nonlora_keys.append(k)
+        for k in nonlora_keys:
+            del tensors[k]
+        safetensors.torch.save_file(tensors, lora_path)
+    adapter_path = snapshot_download(
+        repo_id=
+        "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora")
+    remove_unnecessary_weights(adapter_path)
+    return adapter_path
 @pytest.fixture(scope="session")
 def gemma_lora_files():
    # return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")

--- a/tests/lora/test_jamba.py
+++ b/tests/lora/test_jamba.py
+from typing import List
+import pytest
+import torch
+import vllm
+from vllm.lora.request import LoRARequest
+MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
+MAX_TOKENS = 40
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
+              prompts: List[str]) -> List[str]:
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts: List[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+@pytest.mark.parametrize("tp_size", [4])
+def test_jamba_lora(jamba_lora_files, tp_size):
+    """Original test, the LoRA model has the common target modules, not all"""
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+    prompts = ["Write a story about a sheep and a goat."]
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        distributed_executor_backend="ray",
+        tensor_parallel_size=tp_size,
+    )
+    expected_jamba_output = [
+        """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming"""  # noqa: E501
+    ]
+    assert do_sample(llm, jamba_lora_files, lora_id=1,
+                     prompts=prompts) == expected_jamba_output
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -48,10 +48,14 @@ TOLERANCES = {
    torch.float32: (5e-3, 5e-3),
    torch.bfloat16: (3e-2, 2e-2),
 }
-# TODO: Modify this based on platform
-DEVICES = [
+pytestmark = pytest.mark.skipif(
+    not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
+    reason="Backend not supported")
+DEVICES = ([
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+] if current_platform.is_cuda_alike() else ["cpu"])
 #For GPU, we will launch different triton kernels between the prefill and decode
 # stages, so we need to verify this. prefill stage(True) or decode stage(False)
@@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool:
        from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
        return type(punica_wrapper) is PunicaWrapperGPU
+    elif current_platform.is_cpu():
+        from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
+        return type(punica_wrapper) is PunicaWrapperCPU
    else:
        return False
@@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
    # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
    # device, see: https://github.com/triton-lang/triton/issues/2925
    # Same below.
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
    torch.set_default_device(device)
    max_loras = 8
@@ -313,7 +322,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
                                        vocab_size, stage) -> None:
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
    torch.set_default_device(device)
    max_loras = 8
    punica_wrapper = get_punica_wrapper(8192, 256, device)
@@ -450,7 +461,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
 def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
                                  stage) -> None:
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
    torch.set_default_device(device)
    max_loras = 8
    punica_wrapper = get_punica_wrapper(8192, 256, device)
@@ -582,7 +595,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
 def test_linear_replicated(dist_init, num_loras, device, stage,
                           bias_enabled) -> None:
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device)
    assert check_punica_wrapper(punica_wrapper)
@@ -695,7 +710,9 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
 def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
                         device, stage, bias_enabled) -> None:
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device)
    assert check_punica_wrapper(punica_wrapper)
@@ -818,7 +835,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
 def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
                                device, stage, bias_enabled) -> None:
-    torch.cuda.set_device(device)
+    if current_platform.is_cuda_alike():
+        torch.cuda.set_device(device)
    torch.set_default_device(device)
    punica_wrapper = get_punica_wrapper(8192, 256, device)
    assert check_punica_wrapper(punica_wrapper)
@@ -971,6 +990,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
 @pytest.mark.parametrize("rotary_dim", [None, 32])
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
+@pytest.mark.skipif(not current_platform.is_cuda_alike(),
+                    reason="Only CUDA backends are supported")
 def test_rotary_embedding_long_context(dist_init, num_loras, device,
                                       scaling_factors, max_position,
                                       is_neox_style, rotary_dim, head_size,

--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -3,6 +3,7 @@ from typing import List
 import pytest
 from vllm.lora.models import LoRAModel
+from vllm.lora.peft_helper import PEFTHelper
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper
@@ -30,11 +31,14 @@ def test_load_checkpoints(
        else:
            expected_lora_modules.append(module)
    if lora_name == "baichuan7B":
+        peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
+                                                max_position_embeddings=4096)
        # For the baichuan7B model, load it's LoRA,
        # and the test should pass.
        LoRAModel.from_local_checkpoint(
            baichuan_lora_files,
            expected_lora_modules,
+            peft_helper=peft_helper,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
@@ -43,9 +47,12 @@ def test_load_checkpoints(
        # Test that the target_modules contain prefix
        # such as "model.layers.0.self_atten.W_pack", and
        # the test should pass.
+        peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
+                                                max_position_embeddings=4096)
        LoRAModel.from_local_checkpoint(
            baichuan_zero_lora_files,
            expected_lora_modules,
+            peft_helper=peft_helper,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
@@ -53,9 +60,12 @@ def test_load_checkpoints(
    elif lora_name == "baichuan7B-zero-regex":
        # Test that the `target_modules` in the form of regular expressions,
        # such as `model\\..*(W_pack|o_proj)`, and the test should pass.
+        peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
+                                                max_position_embeddings=4096)
        LoRAModel.from_local_checkpoint(
            baichuan_regex_lora_files,
            expected_lora_modules,
+            peft_helper=peft_helper,
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
@@ -64,10 +74,13 @@ def test_load_checkpoints(
        # For the baichuan7B model, load chatglm3-6b's LoRA,
        # and the test should raise the following error.
        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
+        peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
+                                                max_position_embeddings=4096)
        with pytest.raises(ValueError, match=expected_error):
            LoRAModel.from_local_checkpoint(
                chatglm3_lora_files,
                expected_lora_modules,
+                peft_helper=peft_helper,
                lora_model_id=1,
                device="cpu",
                embedding_modules=embedding_modules,
@@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files):
            ".layers.": ".baichuan_layers.",
        },
    )
+    peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
+                                            max_position_embeddings=4096)
    lora_model = LoRAModel.from_local_checkpoint(
        baichuan_lora_files,
        expected_lora_modules,
+        peft_helper=peft_helper,
        lora_model_id=1,
        device="cpu",
        embedding_modules=embedding_modules,

--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -3,6 +3,7 @@ from typing import List
 import pytest
 from vllm.lora.models import LoRAModel
+from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.model_executor.models.llama import LlamaForCausalLM
@@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
    lora_path = get_adapter_absolute_path(lora_name)
    # lora loading should work for either absolute path and hugggingface id.
+    peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
    lora_model = LoRAModel.from_local_checkpoint(
        lora_path,
        expected_lora_modules,
+        peft_helper=peft_helper,
        lora_model_id=1,
        device="cpu",
        embedding_modules=embedding_modules,

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
-import json
 import os
 from typing import Dict, List
@@ -19,6 +18,7 @@ from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
                                      WorkerLoRAManager)
 from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.platforms import current_platform
 EMBEDDING_MODULES = {
    "embed_tokens": "input_embeddings",
@@ -27,68 +27,20 @@ EMBEDDING_MODULES = {
 EMBEDDING_PADDING_MODULES = ["lm_head"]
-CUDA_DEVICES = [
+DEVICES = ([
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-]
+] if current_platform.is_cuda_alike() else ["cpu"])
-def test_peft_helper(sql_lora_files):
+@pytest.mark.parametrize("device", DEVICES)
-    lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
-    with open(lora_config_path) as f:
-        config = json.load(f)
-    peft_helper = PEFTHelper.from_dict(config)
-    assert peft_helper.r == 8
-    assert peft_helper.lora_alpha == 16
-    assert peft_helper.target_modules == [
-        "q_proj",
-        "v_proj",
-        "k_proj",
-        "o_proj",
-        "gate_proj",
-        "up_proj",
-        "down_proj",
-        "embed_tokens",
-        "lm_head",
-    ]
-    expected_error = "vLLM only supports modules_to_save being None."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(
-            r=8,
-            lora_alpha=16,
-            target_modules=["gate_proj"],
-            modules_to_save=["lm_head"],
-        )
-        PEFTHelper.from_dict(config)
-    expected_error = "vLLM does not yet support RSLoRA."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(r=8,
-                      lora_alpha=16,
-                      target_modules=["gate_proj"],
-                      use_rslora=True)
-        PEFTHelper.from_dict(config)
-    expected_error = "vLLM does not yet support DoRA."
-    with pytest.raises(ValueError, match=expected_error):
-        config = dict(r=8,
-                      lora_alpha=16,
-                      target_modules=["gate_proj"],
-                      use_dora=True)
-        PEFTHelper.from_dict(config)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
    tensors = load_file(
        os.path.join(sql_lora_files, "adapter_model.safetensors"))
    new_embeddings = load_file(
        os.path.join(sql_lora_files, "new_embeddings.safetensors"))
-    lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
+    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
-    with open(lora_config_path) as f:
+                                            max_position_embeddings=4096)
-        config = json.load(f)
-    peft_helper = PEFTHelper.from_dict(config)
    lora_model = LoRAModel.from_lora_tensors(
        1,
        tensors,
@@ -165,7 +117,7 @@ def test_replace_submodules(dist_init, dummy_model):
    manager = LoRAModelManager(
        model, 1, 1, 1,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
-        torch.device("cuda"))
+        torch.device(DEVICES[0]))
    model = manager.model
    assert isinstance(model.get_submodule("dense1"),
@@ -177,7 +129,7 @@ def test_replace_submodules(dist_init, dummy_model):
                      RowParallelLinearWithLoRA)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lora_model_manager(dist_init, dummy_model, device):
    model = dummy_model
    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -238,7 +190,7 @@ def test_lora_model_manager(dist_init, dummy_model, device):
    assert manager.punica_wrapper.device == device
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
    model = dummy_model
    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
@@ -330,7 +282,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
    assert manager.device == device
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lru_lora_model_manager(dist_init, dummy_model, device):
    # This tests just the LRU cache functionality, everything else is
    # tested in test_lora_model_manager
@@ -460,7 +412,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    assert manager.device == device
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                          sql_lora_files, device):
    lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
@@ -539,7 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
            device)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                sql_lora_files, device):
    # Should remove every LoRA not specified in the request.
@@ -615,7 +567,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
            device)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_packed_loras(dist_init, dummy_model_gate_up, device):
    model = dummy_model_gate_up
    model.supported_lora_modules = ["gate_up_proj"]

--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -4,10 +4,11 @@ import os
 import pytest
 import vllm
+from tests.utils import fork_new_process_for_each_test
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
-from ..utils import multi_gpu_test, models_path_prefix
+from ..utils import models_path_prefix
 MODEL_PATH = os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")
@@ -18,13 +19,11 @@ PROMPT_TEMPLATE = (
 IMAGE_ASSETS = [
    ImageAsset("stop_sign"),
-    ImageAsset("cherry_blossom"),
 ]
 # After fine-tuning with LoRA, all generated content should start begin `A`.
 EXPECTED_OUTPUT = [
    "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.",  # noqa: E501
-    "A pink cherry blossom tree with a blue sky in the background.",
 ]
@@ -51,48 +50,75 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    # Print the outputs.
    generated_texts: List[str] = []
    for output in outputs:
-        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
    return generated_texts
-@multi_gpu_test(num_gpus=2)
+@pytest.mark.xfail(
-@pytest.mark.parametrize("fully_sharded", [True, False])
+    current_platform.is_rocm(),
-def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded):
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_lora(minicpmv_lora_files):
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        enforce_eager=True,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )
+    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output1[i])
+    output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output2[i])
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,
        max_num_seqs=2,
        max_loras=4,
        max_lora_rank=64,
-        tensor_parallel_size=2,
+        tensor_parallel_size=4,
        trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
+        enforce_eager=True,
        enable_chunked_prefill=True,
    )
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for i in range(len(EXPECTED_OUTPUT)):
        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
-@multi_gpu_test(num_gpus=4)
+@pytest.mark.xfail(
-@pytest.mark.parametrize("fully_sharded", [True, False])
+    current_platform.is_rocm(),
-def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded):
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+@fork_new_process_for_each_test
+def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,
        max_num_seqs=2,
-        max_loras=4,
+        max_loras=2,
-        max_lora_rank=64,
+        max_lora_rank=8,
        tensor_parallel_size=4,
        trust_remote_code=True,
-        fully_sharded_loras=fully_sharded,
+        fully_sharded_loras=True,
        enable_chunked_prefill=True,
    )
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for i in range(len(EXPECTED_OUTPUT)):
        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
+    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -6,6 +6,7 @@ import os
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 from ..utils import models_path_prefix
 MODEL_PATH = os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1")
@@ -33,7 +34,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
 @pytest.mark.parametrize("tp_size", [4])
 def test_mixtral_lora(mixtral_lora_files, tp_size):
    """Original test, the LoRA model has the common target modules, not all"""
-    if torch.cuda.device_count() < tp_size:
+    if torch.cuda.device_count(
+    ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    prompts = [

--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
+import json
+import math
+import shutil
+import pytest
+from vllm.config import LoRAConfig
+from vllm.lora.peft_helper import PEFTHelper
+ERROR_CASES = [
+    (
+        "test_rank",
+        {
+            "r": 1024
+        },
+        "is greater than max_lora_rank",
+    ),
+    (
+        "test_bias",
+        {
+            "bias": "all"
+        },
+        "Adapter bias cannot be used without bias_enabled",
+    ),
+    ("test_dora", {
+        "use_dora": True
+    }, "does not yet support DoRA"),
+    (
+        "test_modules_to_save",
+        {
+            "modules_to_save": ["lm_head"]
+        },
+        "only supports modules_to_save being None",
+    ),
+]
+def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
+    peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1,
+                                            max_position_embeddings=4096)
+    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
+    peft_helper.validate_legal(lora_config)
+    assert peft_helper.r == 8
+    assert peft_helper.lora_alpha == 16
+    assert peft_helper.target_modules == [
+        "q_proj",
+        "v_proj",
+        "k_proj",
+        "o_proj",
+        "gate_proj",
+        "up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    assert peft_helper.context_length == 16384
+    assert peft_helper.vllm_max_position_embeddings == 4096
+    assert peft_helper.vllm_long_context_scaling_factor == float(
+        math.ceil(peft_helper.context_length /
+                  peft_helper.vllm_max_position_embeddings))
+    # test RSLoRA
+    rslora_config = dict(use_rslora=True)
+    test_dir = tmp_path / "test_rslora"
+    shutil.copytree(long_context_lora_files_16k_1, test_dir)
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
+        adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(rslora_config)
+    # Save modified configuration
+    with open(config_path, "w") as f:
+        json.dump(adapter_config, f)
+    peft_helper = PEFTHelper.from_local_dir(test_dir,
+                                            max_position_embeddings=4096)
+    peft_helper.validate_legal(lora_config)
+    scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
+    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
+@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
+def test_peft_helper_error(
+    sql_lora_files,
+    tmp_path,
+    test_name: str,
+    config_change: dict,
+    expected_error: str,
+):
+    test_dir = tmp_path / test_name
+    shutil.copytree(sql_lora_files, test_dir)
+    # Load and modify configuration
+    config_path = test_dir / "adapter_config.json"
+    with open(config_path) as f:
+        adapter_config = json.load(f)
+    # Apply configuration changes
+    adapter_config.update(config_change)
+    # Save modified configuration
+    with open(config_path, "w") as f:
+        json.dump(adapter_config, f)
+    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
+    # Test loading the adapter
+    with pytest.raises(ValueError, match=expected_error):
+        PEFTHelper.from_local_dir(
+            test_dir, max_position_embeddings=4096).validate_legal(lora_config)
--- a/tests/lora/test_punica_sizes.py
+++ b/tests/lora/test_punica_sizes.py
@@ -4,19 +4,21 @@ hidden_sizes included in the LoRA models currently supported by vLLM. It tests
 whether the corresponding Triton kernel can run normally when tensor parallelism
 is set to [1, 2, 4, 8, 16, 32, 64].
 """
+from threading import Lock
 import pytest
 import torch
-from vllm.lora.ops.bgmv_expand import bgmv_expand
+import vllm.lora.ops.triton_ops  # noqa: F401
-from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-from vllm.lora.ops.bgmv_shrink import bgmv_shrink
+                                     bgmv_shrink, sgmv_expand,
-from vllm.lora.ops.sgmv_expand import sgmv_expand
+                                     sgmv_expand_slice, sgmv_shrink)
-from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.lora.ops.sgmv_shrink import sgmv_shrink
 from vllm.platforms import current_platform
-from .utils import (generate_data, generate_data_for_expand_nslices,
+from .utils import (assert_close, generate_data,
-                    ref_torch_groupgemm)
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices)
 HIDDEN_SIZES = [
    128,
@@ -110,16 +112,9 @@ DTYPES = [torch.float16, torch.bfloat16]
 MAX_RANKS = [32]
 SCALES = [0.5]
 SEED = [0]
-CUDA_DEVICES = [f"cuda:{0}"]
+DEVICES = [f"cuda:{0}"]
-def assert_close(a, b):
+_dict_lock = Lock()
-    rtol, atol = {
-        torch.float16: (6e-2, 6e-2),
-        torch.bfloat16: (6e-2, 6e-2),
-        torch.float32: (1e-2, 1e-2),
-    }[a.dtype]
-    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
 @pytest.mark.parametrize("batches", BATCHES)
@@ -127,16 +122,18 @@ def assert_close(a, b):
 @pytest.mark.parametrize("rank", MAX_RANKS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("scaling", SCALES)
+@pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_sgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
+    nslices: int,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
@@ -148,19 +145,20 @@ def test_punica_sgmv(
    seq_length = 128
    (
        inputs_tensor,
-        lora_weights,
+        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
-    ) = generate_data(
+    ) = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
+        nslices,
        dtype,
        op_type,
        device,
@@ -172,43 +170,85 @@ def test_punica_sgmv(
    else:
        max_seq_length = max_seq_length.item()
    if op_type == "shrink":
-        sgmv_shrink(
+        # Clear the cached pointer dict first to avoid reusing a stale pointer.
-            inputs_tensor,
+        with _dict_lock:
-            lora_weights,
+            _LORA_A_PTR_DICT.clear()
-            our_out_tensor,
+            torch.ops.vllm.sgmv_shrink(
-            b_seq_start_loc,
+                inputs_tensor,
-            seq_len_tensor,
+                lora_weights_lst,
-            lora_indices_tensor,
+                our_out_tensor,
-            batches,
+                b_seq_start_loc,
-            max_seq_length,
+                seq_len_tensor,
-            token_nums,
+                lora_indices_tensor,
-            scaling,
+                batches,
-        )
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
+        for index in range(nslices):
+            sgmv_shrink(
+                inputs_tensor,
+                lora_weights_lst[index],
+                ref_out_tensor[index],
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
    else:
-        sgmv_expand(
+        with _dict_lock:
-            inputs_tensor,
+            _LORA_B_PTR_DICT.clear()
-            lora_weights,
+            torch.ops.vllm.sgmv_expand(
-            our_out_tensor,
+                inputs_tensor,
-            b_seq_start_loc,
+                lora_weights_lst,
-            seq_len_tensor,
+                our_out_tensor,
-            lora_indices_tensor,
+                b_seq_start_loc,
-            batches,
+                seq_len_tensor,
-            max_seq_length,
+                lora_indices_tensor,
-            token_nums,
+                batches,
-            add_inputs=True,
+                max_seq_length,
-        )
+                token_nums,
-    ref_torch_groupgemm(
+                offset_start=0,
-        ref_out_tensor,
+                add_inputs=True,
-        inputs_tensor,
+            )
-        lora_weights,
+        if nslices == 1:
-        lora_indices_tensor,
+            # Verify the torch's sgmv_expand op
-        seq_len_tensor,
+            sgmv_expand(
-        batches,
+                inputs_tensor[0],
-        scaling if op_type == "shrink" else 1.0,
+                lora_weights_lst[0],
-        op_type,
+                ref_out_tensor,
-    )
+                b_seq_start_loc,
-    if op_type == "shrink":
+                seq_len_tensor,
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                add_inputs=True,
+            )
+        else:
+            slice_offset = 0
+            for index in range(nslices):
+                lora_weights = lora_weights_lst[index]
+                sgmv_expand_slice(
+                    inputs_tensor[index],
+                    lora_weights,
+                    ref_out_tensor,
+                    b_seq_start_loc,
+                    seq_len_tensor,
+                    lora_indices_tensor,
+                    batches,
+                    max_seq_length,
+                    token_nums,
+                    slice_offset,
+                    hidden_size,
+                    add_inputs=True,
+                )
+                slice_offset += hidden_size
    assert_close(our_out_tensor, ref_out_tensor)
@@ -220,7 +260,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv(
    batches: int,
    num_loras: int,
@@ -256,31 +296,38 @@ def test_punica_bgmv(
        device,
    )
    if op_type == "shrink":
-        bgmv_shrink(
+        torch.ops.vllm.bgmv_shrink(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            indices,
            scaling,
        )
+        bgmv_shrink(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            scaling,
+        )
    else:
-        bgmv_expand(
+        torch.ops.vllm.bgmv_expand(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            indices,
            add_inputs=True,
        )
-    ref_torch_groupgemm(
+        bgmv_expand(
-        ref_out_tensor,
+            inputs_tensor,
-        inputs_tensor,
+            lora_weights,
-        lora_weights,
+            ref_out_tensor,
-        lora_indices_tensor,
+            indices,
-        seq_len_tensor,
+            add_inputs=True,
-        batches,
+        )
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
    if op_type == "shrink":
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)
@@ -292,25 +339,22 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
-def test_punica_expand_nslices(
+def test_punica_bgmv_expand_nslices(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
-    op_type: str,
    seed: int,
    device: str,
 ):
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
-    seq_length = 128 if op_type == "sgmv" else 1
+    seq_length = 1
    (
        inputs_tensor,
        lora_weights_lst,
@@ -330,50 +374,26 @@ def test_punica_expand_nslices(
        nslices,
        device,
    )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
    slice_offset = 0
    for index in range(nslices):
        lora_weights = lora_weights_lst[index]
-        if op_type == "sgmv":
+        torch.ops.vllm.bgmv_expand_slice(
-            sgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                slice_offset,
-                hidden_size,
-                add_inputs=True,
-            )
-        else:
-            bgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                indices,
-                slice_offset,
-                slice_size=hidden_size,
-                add_inputs=True,
-            )
-        ref_torch_groupgemm(
-            ref_outputs[:, slice_offset:slice_offset + hidden_size],
            inputs_tensor,
            lora_weights,
-            lora_indices_tensor,
+            our_outputs,
-            seq_len_tensor,
+            indices,
-            batches,
+            slice_offset,
-            1.0,
+            slice_size=hidden_size,
-            op_type="expand",
+            add_inputs=True,
+        )
+        bgmv_expand_slice(
+            inputs_tensor,
+            lora_weights,
+            ref_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
        )
        slice_offset += hidden_size

--- a/tests/lora/test_punica_variation.py
+++ b/tests/lora/test_punica_variation.py
@@ -3,22 +3,24 @@ This script is mainly used to test whether trtion kernels can run normally
 under different conditions, including various batches, numbers of LoRA , and
 maximum ranks.
 """
+from threading import Lock
 import pytest
 import torch
 # Enable custom op register
-import vllm.lora.ops.bgmv_expand
+import vllm.lora.ops.triton_ops  # noqa: F401
-import vllm.lora.ops.bgmv_expand_slice
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-import vllm.lora.ops.bgmv_shrink
+                                     bgmv_shrink, sgmv_expand,
-import vllm.lora.ops.sgmv_expand
+                                     sgmv_expand_slice, sgmv_shrink)
-import vllm.lora.ops.sgmv_expand_slice
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-import vllm.lora.ops.sgmv_shrink  # noqa: F401
 from vllm.platforms import current_platform
-from .utils import (generate_data, generate_data_for_expand_nslices,
+from .utils import (assert_close, generate_data,
-                    ref_torch_groupgemm)
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices)
-HIDDEN_SIZES = [1024]
+HIDDEN_SIZES = [1024] # [2049]
 BATCHES = [1, 4, 16, 32]
 NUM_LORA = [1, 8, 32, 128]
@@ -26,26 +28,9 @@ DTYPES = [torch.float16, torch.bfloat16]
 MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
 SCALES = [0.5]
 SEED = [0]
-CUDA_DEVICES = [f"cuda:{0}"]
+DEVICES = [f"cuda:{0}"]
-def assert_close(a, b):
-    rtol, atol = {
-        torch.float16: (6e-2, 6e-2),
-        torch.bfloat16: (6e-2, 6e-2),
-        torch.float32: (1e-2, 1e-2),
-    }[a.dtype]
-    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
-# Unlike test_punica_sizes.py, we directly utilize custom op for
+_dict_lock = Lock()
-# testing, which verifies the correct registration of these ops.
-bgmv_expand = torch.ops.vllm.bgmv_expand
-bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
-bgmv_shrink = torch.ops.vllm.bgmv_shrink
-sgmv_expand = torch.ops.vllm.sgmv_expand
-sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
-sgmv_shrink = torch.ops.vllm.sgmv_shrink
 @pytest.mark.parametrize("batches", BATCHES)
@@ -53,16 +38,18 @@ sgmv_shrink = torch.ops.vllm.sgmv_shrink
 @pytest.mark.parametrize("rank", MAX_RANKS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("scaling", SCALES)
+@pytest.mark.parametrize("nslices", [1, 2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_sgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
+    nslices: int,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
@@ -74,19 +61,20 @@ def test_punica_sgmv(
    seq_length = 128
    (
        inputs_tensor,
-        lora_weights,
+        lora_weights_lst,
        our_out_tensor,
        ref_out_tensor,
        b_seq_start_loc,
        lora_indices_tensor,
        seq_len_tensor,
        indices,
-    ) = generate_data(
+    ) = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
+        nslices,
        dtype,
        op_type,
        device,
@@ -98,43 +86,85 @@ def test_punica_sgmv(
    else:
        max_seq_length = max_seq_length.item()
    if op_type == "shrink":
-        sgmv_shrink(
+        # Clear the cached pointer dict first to avoid reusing a stale pointer.
-            inputs_tensor,
+        with _dict_lock:
-            lora_weights,
+            _LORA_A_PTR_DICT.clear()
-            our_out_tensor,
+            torch.ops.vllm.sgmv_shrink(
-            b_seq_start_loc,
+                inputs_tensor,
-            seq_len_tensor,
+                lora_weights_lst,
-            lora_indices_tensor,
+                our_out_tensor,
-            batches,
+                b_seq_start_loc,
-            max_seq_length,
+                seq_len_tensor,
-            token_nums,
+                lora_indices_tensor,
-            scaling,
+                batches,
-        )
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
+        for index in range(nslices):
+            sgmv_shrink(
+                inputs_tensor,
+                lora_weights_lst[index],
+                ref_out_tensor[index],
+                b_seq_start_loc,
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                scaling,
+            )
    else:
-        sgmv_expand(
+        with _dict_lock:
-            inputs_tensor,
+            _LORA_B_PTR_DICT.clear()
-            lora_weights,
+            torch.ops.vllm.sgmv_expand(
-            our_out_tensor,
+                inputs_tensor,
-            b_seq_start_loc,
+                lora_weights_lst,
-            seq_len_tensor,
+                our_out_tensor,
-            lora_indices_tensor,
+                b_seq_start_loc,
-            batches,
+                seq_len_tensor,
-            max_seq_length,
+                lora_indices_tensor,
-            token_nums,
+                batches,
-            add_inputs=True,
+                max_seq_length,
-        )
+                token_nums,
-    ref_torch_groupgemm(
+                offset_start=0,
-        ref_out_tensor,
+                add_inputs=True,
-        inputs_tensor,
+            )
-        lora_weights,
+        slice_offset = 0
-        lora_indices_tensor,
+        if nslices == 1:
-        seq_len_tensor,
+            # Verify the torch's sgmv_expand op
-        batches,
+            sgmv_expand(
-        scaling if op_type == "shrink" else 1.0,
+                inputs_tensor[0],
-        op_type,
+                lora_weights_lst[0],
-    )
+                ref_out_tensor,
-    if op_type == "shrink":
+                b_seq_start_loc,
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
+                seq_len_tensor,
+                lora_indices_tensor,
+                batches,
+                max_seq_length,
+                token_nums,
+                add_inputs=True,
+            )
+        else:
+            for index in range(nslices):
+                lora_weights = lora_weights_lst[index]
+                sgmv_expand_slice(
+                    inputs_tensor[index],
+                    lora_weights,
+                    ref_out_tensor,
+                    b_seq_start_loc,
+                    seq_len_tensor,
+                    lora_indices_tensor,
+                    batches,
+                    max_seq_length,
+                    token_nums,
+                    slice_offset,
+                    hidden_size,
+                    add_inputs=True,
+                )
+                slice_offset += hidden_size
    assert_close(our_out_tensor, ref_out_tensor)
@@ -146,7 +176,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
 def test_punica_bgmv(
    batches: int,
    num_loras: int,
@@ -158,7 +188,6 @@ def test_punica_bgmv(
    seed: int,
    device: str,
 ):
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
@@ -183,32 +212,38 @@ def test_punica_bgmv(
        device,
    )
    if op_type == "shrink":
-        bgmv_shrink(
+        torch.ops.vllm.bgmv_shrink(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            indices,
            scaling,
        )
-    else:
-        bgmv_expand(
+        bgmv_shrink(
+            inputs_tensor,
+            lora_weights,
+            ref_out_tensor,
+            indices,
+            scaling,
+        )
+    else:
+        torch.ops.vllm.bgmv_expand(
            inputs_tensor,
            lora_weights,
            our_out_tensor,
            indices,
            add_inputs=True,
        )
-    ref_torch_groupgemm(
+        bgmv_expand(
-        ref_out_tensor,
+            inputs_tensor,
-        inputs_tensor,
+            lora_weights,
-        lora_weights,
+            ref_out_tensor,
-        lora_indices_tensor,
+            indices,
-        seq_len_tensor,
+            add_inputs=True,
-        batches,
+        )
-        scaling if op_type == "shrink" else 1.0,
-        op_type,
-    )
    if op_type == "shrink":
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    assert_close(our_out_tensor, ref_out_tensor)
@@ -220,24 +255,22 @@ def test_punica_bgmv(
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("nslices", [2, 3])
 @pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
 @pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("device", DEVICES)
-def test_punica_expand_nslices(
+def test_punica_bgmv_expand_nslices(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
-    op_type: str,
    seed: int,
    device: str,
 ):
    torch.set_default_device(device)
    current_platform.seed_everything(seed)
-    seq_length = 128 if op_type == "sgmv" else 1
+    seq_length = 1
    (
        inputs_tensor,
        lora_weights_lst,
@@ -257,49 +290,26 @@ def test_punica_expand_nslices(
        nslices,
        device,
    )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
    slice_offset = 0
    for index in range(nslices):
        lora_weights = lora_weights_lst[index]
-        if op_type == "sgmv":
+        torch.ops.vllm.bgmv_expand_slice(
-            sgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                slice_offset,
-                hidden_size,
-                add_inputs=True,
-            )
-        else:
-            bgmv_expand_slice(
-                inputs_tensor,
-                lora_weights,
-                our_outputs,
-                indices,
-                slice_offset,
-                slice_size=hidden_size,
-                add_inputs=True,
-            )
-        ref_torch_groupgemm(
-            ref_outputs[:, slice_offset:slice_offset + hidden_size],
            inputs_tensor,
            lora_weights,
-            lora_indices_tensor,
+            our_outputs,
-            seq_len_tensor,
+            indices,
-            batches,
+            slice_offset,
-            1.0,
+            slice_size=hidden_size,
-            op_type="expand",
+            add_inputs=True,
+        )
+        bgmv_expand_slice(
+            inputs_tensor,
+            lora_weights,
+            ref_outputs,
+            indices,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=True,
        )
        slice_offset += hidden_size

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -75,7 +75,8 @@ def do_sample(llm: vllm.LLM,
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                          tp_size):
-    if num_gpus_available < tp_size:
+    if num_gpus_available < tp_size and \
+        tp_size > 1 and current_platform.is_cuda_alike():
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    llm = vllm.LLM(

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
-MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
 PROMPT_TEMPLATE = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
@@ -49,16 +49,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    # Print the outputs.
    generated_texts: List[str] = []
    for output in outputs:
-        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        print(f"Generated text: {generated_text!r}")
    return generated_texts
-@pytest.mark.xfail(current_platform.is_rocm(),
+@pytest.mark.xfail(
-                   reason="Qwen2-VL dependency xformers incompatible with ROCm"
+    current_platform.is_rocm(),
-                   )
+    reason="Qwen2-VL dependency xformers incompatible with ROCm")
 def test_qwen2vl_lora(qwen2vl_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,

--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -18,11 +18,13 @@ class DummyLoRAManager:
    def get_module_lora(self, module_name: str) -> LoRALayerWeights:
        return self._loras[module_name]
-    def init_random_lora(self,
+    def init_random_lora(
-                         module_name: str,
+        self,
-                         weight: torch.Tensor,
+        module_name: str,
-                         rank: int = 8,
+        weight: torch.Tensor,
-                         generate_embeddings_tensor: int = 0):
+        rank: int = 8,
+        generate_embeddings_tensor: int = 0,
+    ):
        lora = LoRALayerWeights(
            module_name,
            rank=rank,
@@ -35,21 +37,25 @@ class DummyLoRAManager:
                              device=self._device),
        )
        if generate_embeddings_tensor:
-            lora.embeddings_tensor = torch.rand(5,
+            lora.embeddings_tensor = torch.rand(
-                                                generate_embeddings_tensor,
+                5,
-                                                dtype=weight.dtype,
+                generate_embeddings_tensor,
-                                                device=self._device)
+                dtype=weight.dtype,
+                device=self._device,
+            )
        self.set_module_lora(module_name, lora)
        return lora
-    def init_lora(self,
+    def init_lora(
-                  module_name: str,
+        self,
-                  input_dim: int,
+        module_name: str,
-                  output_dim: int,
+        input_dim: int,
-                  rank=8,
+        output_dim: int,
-                  noop=False,
+        rank=8,
-                  embeddings_tensor=None):
+        noop=False,
+        embeddings_tensor=None,
+    ):
        lora = LoRALayerWeights(
            module_name,
            rank=rank,
@@ -98,35 +104,16 @@ def assert_close(a, b):
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
-def ref_torch_groupgemm(
+def generate_data(
-    out_tensor,
-    inputs,
-    lora_weights,
-    lora_indices_tensor,
-    seq_len_tensor,
    batches,
-    scaling,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    dtype,
    op_type,
-) -> torch.Tensor:
+    device,
-    out_list = []
+):
-    current_offset = 0
-    for lora_index, b_length in zip(range(batches), seq_len_tensor):
-        input_weight = inputs[current_offset:b_length + current_offset, :]
-        current_offset += b_length
-        lora_weight = lora_weights[lora_indices_tensor[lora_index]]
-        result = torch.nn.functional.linear(input_weight, lora_weight)
-        result *= scaling
-        out_list.append(result)
-    cat_result = torch.cat(out_list, dim=0)
-    if op_type == "expand":
-        out_tensor += cat_result
-    else:
-        out_tensor.copy_(cat_result)
-    return
-def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
-                  op_type, device):
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    b_seq_start_loc = torch.cumsum(
@@ -187,8 +174,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
    )
-def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
+def generate_data_for_expand_nslices(
-                                     seq_length, dtype, nslices, device):
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    dtype,
+    nslices,
+    device,
+):
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    b_seq_start_loc = torch.cumsum(
@@ -221,7 +216,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
    for b_id in range(batches):
        lora_index = lora_indices_tensor[b_id]
        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = lora_index.item()
+                seq_len_tensor[b_id]] = (lora_index.item())
+        current_offset += seq_len_tensor[b_id].item()
+    lora_indices_tensor = lora_indices_tensor.to(device)
+    return (
+        inputs_tensor,
+        lora_weights_lst,
+        our_out_tensor,
+        ref_out_tensor,
+        b_seq_start_loc,
+        lora_indices_tensor,
+        seq_len_tensor,
+        indices,
+    )
+def generate_data_for_nslices(
+    batches,
+    hidden_size,
+    lora_nums,
+    max_rank,
+    seq_length,
+    nslices,
+    dtype,
+    op_type,
+    device,
+):
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
+                                   (batches, )).to(device)
+    b_seq_start_loc = torch.cumsum(
+        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+        dim=0,
+    ).to(device)
+    total_tokens = seq_len_tensor.sum()
+    lora_weights_lst = []
+    if op_type == "shrink":
+        inputs_tensor = torch.rand((total_tokens, hidden_size),
+                                   dtype=dtype).to(device)
+        for _ in range(nslices):
+            if op_type == "shrink":
+                lora_weights_lst.append(
+                    torch.rand(
+                        (lora_nums, max_rank, hidden_size),  # col-major
+                        dtype=dtype,
+                    ).to(device))
+        # NOTE: the shrink kernel uses torch.float32 as its output type.
+        # The shrink op needs atomic_add, so the output is initialized to 0.
+        our_out_tensor = torch.zeros(
+            (nslices, total_tokens, max_rank),
+            dtype=torch.float32,
+        ).to(device)
+    else:
+        inputs_tensor = torch.rand(
+            (nslices, total_tokens, max_rank),
+            dtype=dtype,
+        ).to(device)
+        for _ in range(nslices):
+            lora_weights_lst.append(
+                torch.rand(
+                    (lora_nums, hidden_size, max_rank),  # col-major
+                    dtype=dtype,
+                ).to(device))
+        # The expand op needs to compute y += a @ lora_b, so the output is
+        # initialized randomly.
+        our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
+                                    dtype=dtype).to(device)
+    # Ensure the same input.
+    ref_out_tensor = our_out_tensor.clone()
+    lora_indices_tensor = torch.randint(0,
+                                        lora_nums - 1 if lora_nums > 1 else 1,
+                                        (batches, ))
+    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
+    current_offset = 0
+    for b_id in range(batches):
+        lora_index = lora_indices_tensor[b_id]
+        indices[current_offset:current_offset +
+                seq_len_tensor[b_id]] = (lora_index.item())
        current_offset += seq_len_tensor[b_id].item()
    lora_indices_tensor = lora_indices_tensor.to(device)

--- a/tests/model_executor/test_model_load_with_params.py
+++ b/tests/model_executor/test_model_load_with_params.py
@@ -2,7 +2,7 @@ import os
 import pytest
-from vllm.model_executor.layers.pooler import PoolingType
+from vllm.model_executor.layers.pooler import CLSPool, PoolingType
 from vllm.model_executor.models.bert import BertEmbeddingModel
 from vllm.model_executor.models.roberta import RobertaEmbeddingModel
 from vllm.platforms import current_platform
@@ -26,13 +26,12 @@ def test_model_loading_with_params(vllm_runner):
    with vllm_runner(model_name=MODEL_NAME,
                     revision=REVISION,
                     dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as model:
+                     max_model_len=MAX_MODEL_LEN) as vllm_model:
-        output = model.encode("Write a short story about a robot that"
+        output = vllm_model.encode("Write a short story about a robot that"
-                              " dreams for the first time.\n")
+                                   " dreams for the first time.\n")
-        model_config = model.model.llm_engine.model_config
+        model_config = vllm_model.model.llm_engine.model_config
+        model_tokenizer = vllm_model.model.llm_engine.tokenizer
-        model_tokenizer = model.model.llm_engine.tokenizer
        # asserts on the bert model config file
        assert model_config.encoder_config["max_seq_length"] == 512
@@ -47,11 +46,13 @@ def test_model_loading_with_params(vllm_runner):
        assert model_tokenizer.tokenizer_config["do_lower_case"]
        assert model_tokenizer.tokenizer.model_max_length == 512
-        model = model.model.llm_engine.model_executor\
+        def check_model(model):
-                     .driver_worker.model_runner.model
+            assert isinstance(model, BertEmbeddingModel)
-        assert isinstance(model, BertEmbeddingModel)
+            assert model._pooler.pooling_type == PoolingType.CLS
-        assert model._pooler.pooling_type == PoolingType.CLS
+            assert model._pooler.normalize
-        assert model._pooler.normalize
+        vllm_model.apply_model(check_model)
        # assert output
        assert output
@@ -65,13 +66,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
    with vllm_runner(model_name=MODEL_NAME_ROBERTA,
                     revision=REVISION_ROBERTA,
                     dtype="float16",
-                     max_model_len=MAX_MODEL_LEN) as model:
+                     max_model_len=MAX_MODEL_LEN) as vllm_model:
-        output = model.encode("Write a short story about a robot that"
+        output = vllm_model.encode("Write a short story about a robot that"
-                              " dreams for the first time.\n")
+                                   " dreams for the first time.\n")
-        model_config = model.model.llm_engine.model_config
+        model_config = vllm_model.model.llm_engine.model_config
+        model_tokenizer = vllm_model.model.llm_engine.tokenizer
-        model_tokenizer = model.model.llm_engine.tokenizer
        # asserts on the bert model config file
        assert model_config.encoder_config["max_seq_length"] == 512
@@ -85,11 +85,38 @@ def test_roberta_model_loading_with_params(vllm_runner):
        assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")
        assert not model_tokenizer.tokenizer_config["do_lower_case"]
-        model = model.model.llm_engine.model_executor\
+        def check_model(model):
-                     .driver_worker.model_runner.model
+            assert isinstance(model, RobertaEmbeddingModel)
-        assert isinstance(model, RobertaEmbeddingModel)
+            assert model._pooler.pooling_type == PoolingType.MEAN
-        assert model._pooler.pooling_type == PoolingType.MEAN
+            assert model._pooler.normalize
-        assert model._pooler.normalize
+        vllm_model.apply_model(check_model)
        # assert output
        assert output
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="Xformers backend is not supported on ROCm.")
+def test_facebook_roberta_model_loading_with_params(vllm_runner):
+    """
+    Test loading roberta-base model with no lm_head.
+    """
+    model_name = "FacebookAI/roberta-base"
+    with vllm_runner(model_name=model_name,
+                     dtype="float16",
+                     max_model_len=MAX_MODEL_LEN) as vllm_model:
+        output = vllm_model.encode("Write a short story about a robot that"
+                                   " dreams for the first time.\n")
+        model_tokenizer = vllm_model.model.llm_engine.tokenizer
+        assert model_tokenizer.tokenizer_id == model_name
+        def check_model(model):
+            assert isinstance(model, RobertaEmbeddingModel)
+            assert not hasattr(model, "lm_head")
+            assert isinstance(model._pooler, CLSPool)
+        vllm_model.apply_model(check_model)
+        assert output
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -240,8 +240,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
 @pytest.mark.asyncio
-async def test_online_inference(client, audio_assets):
+async def test_online_serving(client, audio_assets):
-    """Exercises online inference with/without chunked prefill enabled."""
+    """Exercises online serving with/without chunked prefill enabled."""
    messages = [{
        "role":

--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -20,18 +20,17 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize(
-    "kv_cache_dtype,base_model,test_model,scale_path",
+    "kv_cache_dtype,base_model,test_model",
    [
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
        ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-          os.path.join(models_path_prefix, "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"), None),
+         os.path.join(models_path_prefix, "nm-testing/Llama-3.2-1B-Instruct-FP8-KV")),
        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
        ("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-         os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), None),
+         os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")),
        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
        ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
-         os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
+         os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"))
-         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
    ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
@@ -49,7 +48,6 @@ def test_models(
    kv_cache_dtype: str,
    base_model: str,
    test_model: str,
-    scale_path: Optional[str],
    max_tokens: int,
    enforce_eager: bool,
    backend: str,
@@ -77,10 +75,6 @@ def test_models(
        baseline_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)
-    extra_kwargs = {}
-    if scale_path is not None:
-        extra_kwargs["quantization_param_path"] = scale_path
    with vllm_runner(
            test_model,
            max_model_len=MAX_MODEL_LEN,
@@ -88,7 +82,6 @@ def test_models(
            enforce_eager=enforce_eager,
            kv_cache_dtype=kv_cache_dtype,
            disable_async_output_proc=disable_async_output_proc,
-            **extra_kwargs,
    ) as vllm_model:
        test_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -4,6 +4,7 @@ Note: To pass the test, quantization higher than Q4 should be used
 """
 import os
+from typing import List, NamedTuple, Type
 import pytest
 from huggingface_hub import hf_hub_download
@@ -11,6 +12,7 @@ from transformers import AutoTokenizer
 from tests.quantization.utils import is_quant_method_supported
+from ....conftest import VllmRunner
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
@@ -19,31 +21,78 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
+class GGUFTestConfig(NamedTuple):
+    original_model: str
+    gguf_repo: str
+    gguf_filename: str
+    @property
+    def gguf_model(self):
+        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
+LLAMA_CONFIG = GGUFTestConfig(
+    original_model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
+    gguf_repo=os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
+)
+QWEN2_CONFIG = GGUFTestConfig(
+    original_model=os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"),
+    gguf_repo=os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "qwen2.5-1.5b-instruct-q6_k.gguf"),
+)
+PHI3_CONFIG = GGUFTestConfig(
+    original_model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-mini-instruct"),
+    gguf_repo=os.path.join(models_path_prefix, "bartowski/Phi-3.5-mini-instruct-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "Phi-3.5-mini-instruct-IQ4_XS.gguf"),
+)
+GPT2_CONFIG = GGUFTestConfig(
+    original_model=os.path.join(models_path_prefix, "openai-community/gpt2-large"),
+    gguf_repo=os.path.join(models_path_prefix, "QuantFactory/gpt2-large-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "gpt2-large.Q4_K_M.gguf"),
+)
+STABLELM_CONFIG = GGUFTestConfig(
+    original_model=os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
+    gguf_repo=os.path.join(models_path_prefix, "afrideva/stablelm-3b-4e1t-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "stablelm-3b-4e1t.q4_k_m.gguf"),
+)
+STARCODER_CONFIG = GGUFTestConfig(
+    original_model=os.path.join(models_path_prefix, "bigcode/starcoder2-3b"),
+    gguf_repo=os.path.join(models_path_prefix, "QuantFactory/starcoder2-3b-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "starcoder2-3b.Q6_K.gguf"),
+)
+DOLPHIN_CONFIG = GGUFTestConfig(
+    # Test VocabParallelEmbedding sharding issue.
+    original_model=os.path.join(models_path_prefix, "cognitivecomputations/TinyDolphin-2.8-1.1b"),
+    gguf_repo=os.path.join(models_path_prefix, "tsunemoto/TinyDolphin-2.8-1.1b-GGUF"),
+    gguf_filename=os.path.join(models_path_prefix, "tinydolphin-2.8-1.1b.Q6_K.gguf"),
+)
+MODELS = [
+    LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG,
+    DOLPHIN_CONFIG
+    # STARCODER_CONFIG, # broken
+]
 @pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
+@pytest.mark.parametrize("model", MODELS)
-    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-     os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
-     os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
-    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-     os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
-     os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
-    (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GGUF"),
-     os.path.join(models_path_prefix, "qwen2-1_5b-instruct-q4_k_m.gguf")),
-    (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), os.path.join(models_path_prefix, "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF"),
-     os.path.join(models_path_prefix, "Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
-])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_models(
-    num_gpus_available,
+    num_gpus_available: int,
-    vllm_runner,
+    vllm_runner: Type[VllmRunner],
-    example_prompts,
+    example_prompts: List[str],
-    original_model,
+    model: GGUFTestConfig,
-    gguf_id,
-    gguf_path,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
@@ -52,28 +101,29 @@ def test_models(
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
+    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
+    if tokenizer.chat_template is not None:
-    tokenizer = AutoTokenizer.from_pretrained(original_model)
+        messages = [[{
-    messages = [[{
+            'role': 'user',
-        'role': 'user',
+            'content': prompt
-        'content': prompt
+        }] for prompt in example_prompts]
-    }] for prompt in example_prompts]
+        example_prompts = tokenizer.apply_chat_template(
-    example_prompts = tokenizer.apply_chat_template(messages,
+            messages, tokenize=False, add_generation_prompt=True)
-                                                    tokenize=False,
-                                                    add_generation_prompt=True)
    # Run unquantized model.
-    with vllm_runner(model_name=original_model,
+    with vllm_runner(
-                     dtype=dtype,
+            model_name=model.original_model,
-                     max_model_len=MAX_MODEL_LEN,
+            enforce_eager=True,  # faster tests
-                     tensor_parallel_size=tp_size) as original_model:
+            dtype=dtype,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tp_size) as original_model:
        original_outputs = original_model.generate_greedy_logprobs(
            example_prompts[:-1], max_tokens, num_logprobs)
    # Run gguf model.
-    with vllm_runner(model_name=gguf_model,
+    with vllm_runner(model_name=model.gguf_model,
+                     enforce_eager=True,
+                     tokenizer_name=model.original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_model: