Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · ec5e299c · ec5e299c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
 # SPDX-License-Identifier: Apache-2.0

 from itertools import accumulate, product
-from typing import Dict, List, Optional
+from typing import Callable, Dict, List, Optional

 import pytest
 import torch
@@ -24,7 +24,21 @@ CUDA_DEVICES = [
 ]


+def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
+                           head_size: int) -> tuple[int, ...]:
+    return (batch_size, seq_len, num_heads * head_size)
+
+
+def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
+                            head_size: int) -> tuple[int, ...]:
+    return (batch_size, seq_len, num_heads, head_size)
+
+
+TENSORS_SHAPES_FN = [_get_batch_tensor_shape, _get_flat_tensor_shape]
+
+
 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("seq_len", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -36,6 +50,7 @@ CUDA_DEVICES = [
 @torch.inference_mode()
 def test_rotary_embedding(
    is_neox_style: bool,
+    tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
    batch_size: int,
    seq_len: int,
    num_heads: int,
@@ -55,13 +70,11 @@ def test_rotary_embedding(
    if rotary_dim is None:
        rotary_dim = head_size
    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
-    rope = rope.to(dtype=dtype)
+    rope = rope.to(dtype=dtype, device=torch.get_default_device())

    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
+    query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
+    query = torch.randn(query_shape, dtype=dtype)
    key = torch.randn_like(query)

    # NOTE(woosuk): The reference implementation should be executed first
@@ -80,6 +93,7 @@ def test_rotary_embedding(
    

 @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
+@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("seq_len", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
@@ -91,6 +105,7 @@ def test_rotary_embedding(
 @torch.inference_mode()
 def test_batched_rotary_embedding(
    is_neox_style: bool,
+    tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
    batch_size: int,
    seq_len: int,
    num_heads: int,
@@ -110,13 +125,11 @@ def test_batched_rotary_embedding(
        "rope_type": "linear",
        "factor": (1, )
    })
-    rope = rope.to(dtype=dtype)
+    rope = rope.to(dtype=dtype, device=torch.get_default_device())

    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
+    query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
+    query = torch.randn(query_shape, dtype=dtype)
    key = torch.randn_like(query)

    # NOTE(woosuk): The reference implementation should be executed first
@@ -171,7 +184,7 @@ def test_batched_rotary_embedding_multi_lora(
        "rope_type": "linear",
        "factor": tuple(scaling_factors)
    })
-    rope = rope.to(dtype=dtype)
+    rope = rope.to(dtype=dtype, device=torch.get_default_device())

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size,

--- a/tests/kernels/test_rotary_embedding.py
+++ b/tests/kernels/test_rotary_embedding.py
@@ -41,7 +41,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
                                  is_neox_style, rotary_dim, head_size,
                                  seq_len):
    batch_size = 1
-    base = 0
+    base = 10000
    num_heads = 7
    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                          is_neox_style, torch.float32)

--- a/tests/kernels/untest_flashinfer.py
+++ b/tests/kernels/untest_flashinfer.py
@@ -273,6 +273,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
        seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int],
        head_size: int, dtype: torch.dtype, block_size: int,
        soft_cap: Optional[float]) -> None:
+    pytest.skip("TODO: fix the accuracy issue")
    torch.set_default_device("cuda")
    current_platform.seed_everything(0)
    num_seqs = len(seq_lens)
@@ -384,6 +385,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
    block_size: int,
    soft_cap: Optional[float],
 ) -> None:
+    pytest.skip("TODO: fix the accuracy issue")
    # test doesn't work for num_heads = (16,16)
    torch.set_default_device("cuda")
    current_platform.seed_everything(0)

--- a/tests/kv_transfer/disagg_test.py
+++ b/tests/kv_transfer/disagg_test.py
@@ -30,7 +30,7 @@ def setup_servers():
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
-        os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
+        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
        "--port",
        "8100",
        "--gpu-memory-utilization",
@@ -51,7 +51,7 @@ def setup_servers():
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
-        os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
+        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
        "--port",
        "8200",
        "--gpu-memory-utilization",
@@ -102,8 +102,7 @@ def test_disaggregated_prefilling(prompt):
    response = requests.post("http://localhost:8100/v1/completions",
                             headers={"Content-Type": "application/json"},
                             json={
-                                 "model":
-                                 "meta-llama/Meta-Llama-3.1-8B-Instruct",
+                                 "model": "meta-llama/Llama-3.2-1B-Instruct",
                                 "prompt": prompt,
                                 "max_tokens": 1,
                                 "temperature": 0
@@ -114,8 +113,7 @@ def test_disaggregated_prefilling(prompt):
    response = requests.post("http://localhost:8200/v1/completions",
                             headers={"Content-Type": "application/json"},
                             json={
-                                 "model":
-                                 "meta-llama/Meta-Llama-3.1-8B-Instruct",
+                                 "model": "meta-llama/Llama-3.2-1B-Instruct",
                                 "prompt": prompt,
                                 "max_tokens": 10,
                                 "temperature": 0

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -245,6 +245,11 @@ def qwen2vl_lora_files():
    return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")


+@pytest.fixture(scope="session")
+def qwen25vl_lora_files():
+    return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
+
+
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
    # return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
@@ -324,3 +329,20 @@ def llama_2_7b_engine_extra_embeddings():
 def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
    yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
           model_runner.model)
+
+
+@pytest.fixture(params=[True, False])
+def run_with_both_engines_lora(request, monkeypatch):
+    # Automatically runs tests twice, once with V1 and once without
+    use_v1 = request.param
+    # Tests decorated with `@skip_v1` are only run without v1
+    skip_v1 = request.node.get_closest_marker("skip_v1")
+
+    if use_v1:
+        if skip_v1:
+            pytest.skip("Skipping test on vllm V1")
+        monkeypatch.setenv('VLLM_USE_V1', '1')
+    else:
+        monkeypatch.setenv('VLLM_USE_V1', '0')
+
+    yield
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
+# SPDX-License-Identifier: Apache-2.0
+import asyncio
+import time
+from pathlib import Path
+from typing import List
+
+import pytest
+from huggingface_hub import snapshot_download
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.inputs import TextPrompt
+from vllm.lora.request import LoRARequest
+from vllm.sampling_params import SamplingParams
+from vllm.utils import merge_async_iterators
+
+MODEL_PATH = "meta-llama/Llama-2-7b-hf"
+LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
+LORA_RANK = 8
+DEFAULT_MAX_LORAS = 16 * 3
+
+
+def download_and_prepare_lora_module():
+    """
+    Request submission is expensive when the LoRA adapters have their own
+    tokenizers. This is because, for each request with a new LoRA adapter ID,
+    the front-end loads the tokenizer from disk.
+
+    In this test, as we are comparing request processing times, we want to
+    minimize any extra activity. To this effect, we download the LoRA
+    adapter and remove all the tokenizer files, so the engine will default
+    to the base model tokenizer.
+    """
+    global LORA_MODULE_DOWNLOAD_PATH
+
+    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
+    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
+
+    tokenizer_files = [
+        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
+        'tokenizer.model'
+    ]
+    for tokenizer_file in tokenizer_files:
+        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
+        del_path.unlink(missing_ok=True)
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+def get_lora_requests() -> List[LoRARequest]:
+    lora_requests: List[LoRARequest] = [
+        LoRARequest(lora_name=f"{i}",
+                    lora_int_id=i,
+                    lora_path=LORA_MODULE_DOWNLOAD_PATH)
+        for i in range(1, DEFAULT_MAX_LORAS + 1)
+    ]
+    return lora_requests
+
+
+async def requests_processing_time(llm,
+                                   lora_requests: List[LoRARequest]) -> float:
+
+    sampling_params = SamplingParams(n=1,
+                                     temperature=0.0,
+                                     top_p=1.0,
+                                     ignore_eos=True,
+                                     max_tokens=1)
+
+    generators = []
+    start = time.perf_counter()
+
+    for lora_request in lora_requests:
+        lora_int_id = lora_request.lora_int_id
+        generator = llm.generate(
+            prompt=TextPrompt(prompt=f"hello {lora_int_id}",
+                              multi_modal_data=None),  # type: ignore 
+            sampling_params=sampling_params,
+            lora_request=lora_request,
+            request_id=f"test{lora_int_id}")
+        generators.append(generator)
+
+    all_gens = merge_async_iterators(*generators)
+    async for i, res in all_gens:
+        pass
+
+    end = time.perf_counter()
+    return end - start
+
+
+@pytest.mark.asyncio
+async def test_add_lora():
+    """ 
+    The add_lora function is used to pre-load some LoRA adapters into the
+    engine in anticipation of future requests using these adapters. To test
+    this functionality, we use the async engine to process some requests - We
+    do it twice, once with add_lora() pre-loading and once without.
+
+    We measure the request processing time in both cases and expect the time 
+    to be lesser in the case with add_lora() calls.
+    """
+
+    download_and_prepare_lora_module()
+
+    lora_requests: List[LoRARequest] = get_lora_requests()
+
+    max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
+    # Create engine in eager-mode. Due to high max_loras, the CI can
+    # OOM during cuda-graph capture.
+    engine_args = AsyncEngineArgs(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=max_loras,
+        max_lora_rank=LORA_RANK,
+        max_model_len=128,
+        gpu_memory_utilization=0.8,  #avoid OOM
+        enforce_eager=True)
+
+    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
+    # environment variable. reload vllm.enging.async_llm_engine as
+    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
+    # env var.
+    import importlib
+
+    import vllm.engine.async_llm_engine
+    importlib.reload(vllm.engine.async_llm_engine)
+    from vllm.entrypoints.openai.api_server import (
+        build_async_engine_client_from_engine_args)
+
+    # split lora_requests into 3 parts
+    part_size = len(lora_requests) // 3
+    dummy_run_requests = lora_requests[:part_size]
+    warmup_run_requests = lora_requests[part_size:part_size * 2]
+    cold_run_requests = lora_requests[part_size * 2:]
+
+    async with build_async_engine_client_from_engine_args(engine_args) as llm:
+
+        # Dummy run - So any 1-time functionality like triton kernel compilation
+        # is complete here.
+        await requests_processing_time(llm, dummy_run_requests)
+
+        # Run with warmup
+        for lr in warmup_run_requests:
+            await llm.add_lora(lr)
+        # Wait for the add_lora function to complete on the server side.
+        await asyncio.sleep(30)
+        time_with_add_lora = await requests_processing_time(
+            llm, warmup_run_requests)
+
+        # Run without any warmup
+        time_cold_start = await requests_processing_time(
+            llm, cold_run_requests)
+
+    print(f"time hot-start {time_with_add_lora} vs "
+          f"time cold-start {time_cold_start} ")
+
+    assert time_with_add_lora < time_cold_start, (
+        f"time_with_add_lora={time_with_add_lora}, "
+        f"time_cold_start={time_cold_start}"
+        "The engine request processing time with LoRA pre-loading "
+        "must be less than the version that does on-demand LoRA loading.")
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -45,6 +45,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def test_baichuan_lora(baichuan_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,

--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -3,6 +3,8 @@
 from typing import List
 import os

+import pytest
+
 import vllm
 from tests.utils import fork_new_process_for_each_test
 from vllm.lora.request import LoRARequest
@@ -49,6 +51,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+@pytest.mark.skip_v1
 @fork_new_process_for_each_test
 def test_chatglm3_lora(chatglm3_lora_files):
    llm = vllm.LLM(MODEL_PATH,
@@ -68,6 +79,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
        assert output2[i] == EXPECTED_LORA_OUTPUT[i]


+@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
@@ -89,6 +101,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
        assert output2[i] == EXPECTED_LORA_OUTPUT[i]


+@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):

--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
@@ -36,6 +36,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.xfail(current_platform.is_rocm(),
                   reason="There can be output mismatch on ROCm")
 def test_gemma_lora(gemma_lora_files):

--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -2,6 +2,7 @@

 from typing import List

+import pytest
 import ray

 import vllm
@@ -74,6 +75,14 @@ def generate_and_test(llm, sql_lora_files):
    print("removing lora")


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @fork_new_process_for_each_test
 def test_llama_lora(sql_lora_files):

@@ -86,6 +95,9 @@ def test_llama_lora(sql_lora_files):
    generate_and_test(llm, sql_lora_files)


+# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
+# used by the engine yet.
+@pytest.mark.skip_v1
 @fork_new_process_for_each_test
 def test_llama_lora_warmup(sql_lora_files):
    """Test that the LLM initialization works with a warmup LORA path and

--- a/tests/lora/test_lora_bias_e2e.py
+++ b/tests/lora/test_lora_bias_e2e.py
@@ -32,6 +32,17 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+# Skipping for V1 for now as we are hitting,
+# "Head size 80 is not supported by FlashAttention." error.
+@pytest.mark.skip_v1
 @pytest.mark.parametrize("lora_bias", [True])
 @pytest.mark.parametrize("fully_sharded", [True, False])
 def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool):

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -606,20 +606,26 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):

    assert isinstance(model.get_submodule("gate_up_proj"),
                      MergedColumnParallelLinearWithLoRA)
+    # Verify packed lora is correct
+    model_lora_clone = model_lora.clone(1)
+    model_lora_clone1 = model_lora1.clone(1)
    assert manager.add_adapter(model_lora)
    assert manager.add_adapter(model_lora1)

+    assert model_lora.get_lora("gate_proj") is None
+    assert model_lora.get_lora("up_proj") is None
+    assert model_lora1.get_lora("up_proj") is None
    packed_lora = model_lora.get_lora("gate_up_proj")
    assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)

    torch.testing.assert_close(packed_lora.lora_a[0],
-                               model_lora.get_lora("gate_proj").lora_a)
+                               model_lora_clone.get_lora("gate_proj").lora_a)
    torch.testing.assert_close(packed_lora.lora_b[0],
-                               model_lora.get_lora("gate_proj").lora_b)
+                               model_lora_clone.get_lora("gate_proj").lora_b)
    torch.testing.assert_close(packed_lora.lora_a[1],
-                               model_lora.get_lora("up_proj").lora_a)
+                               model_lora_clone.get_lora("up_proj").lora_a)
    torch.testing.assert_close(packed_lora.lora_b[1],
-                               model_lora.get_lora("up_proj").lora_b)
+                               model_lora_clone.get_lora("up_proj").lora_b)

    packed_lora1 = model_lora1.get_lora("gate_up_proj")
    assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
@@ -627,6 +633,6 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
    assert packed_lora1.lora_a[0] is None
    assert packed_lora1.lora_b[0] is None
    torch.testing.assert_close(packed_lora1.lora_a[1],
-                               model_lora1.get_lora("up_proj").lora_a)
+                               model_lora_clone1.get_lora("up_proj").lora_a)
    torch.testing.assert_close(packed_lora1.lora_b[1],
-                               model_lora1.get_lora("up_proj").lora_b)
+                               model_lora_clone1.get_lora("up_proj").lora_b)
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
@@ -3,6 +3,8 @@
 from typing import List
 import os

+import pytest
+
 import vllm
 from vllm.lora.request import LoRARequest
 from ..utils import models_path_prefix
@@ -50,6 +52,17 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+# Skipping for V1 for now as we are hitting,
+# "Head size 80 is not supported by FlashAttention." error.
+@pytest.mark.skip_v1
 def test_phi2_lora(phi2_lora_files):
    # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
    # Otherwise, the lora-test will fail due to CUDA OOM.

--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
+# SPDX-License-Identifier: Apache-2.0
+from threading import Lock
+from typing import List
+
+import pytest
+import torch
+
+import vllm.lora.ops.triton_ops  # noqa: F401
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.platforms import current_platform
+
+from .utils import (PunicaTensors, assert_close, generate_data,
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices)
+
+
+# Utility shrink and expand operations used as reference implementations.
+def sgmv_shrink_for_nslices(
+        nslices: int, inputs_tensor: torch.Tensor,
+        lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor,
+        b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
+        prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
+        num_tokens: int, scaling: float):
+    """
+    Wrapper around sgmv_shrink that handles any nslices.
+    """
+    for index in range(nslices):
+        sgmv_shrink(
+            inputs_tensor,
+            lora_weights_lst[index],
+            out_tensor[index],
+            b_seq_start_loc,
+            seq_len_tensor,
+            prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            num_tokens,
+            scaling,
+        )
+
+
+def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
+                            inputs_tensor: torch.Tensor,
+                            lora_weights_lst: List[torch.Tensor],
+                            out_tensor: torch.Tensor,
+                            b_seq_start_loc: torch.Tensor,
+                            seq_len_tensor: torch.Tensor,
+                            prompt_lora_mapping: torch.Tensor, batches: int,
+                            max_seq_length: int, num_tokens: int,
+                            add_inputs: bool) -> None:
+    """
+    Wrapper around sgmv_expand that handles any nslices.
+    """
+    if nslices == 1:
+        # Verify the torch's sgmv_expand op
+        sgmv_expand(
+            inputs_tensor[0],
+            lora_weights_lst[0],
+            out_tensor,
+            b_seq_start_loc,
+            seq_len_tensor,
+            prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            num_tokens,
+            add_inputs=add_inputs,
+        )
+    else:
+        slice_offset = 0
+        for index in range(nslices):
+            lora_weights = lora_weights_lst[index]
+            sgmv_expand_slice(
+                inputs_tensor[index],
+                lora_weights,
+                out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                prompt_lora_mapping,
+                batches,
+                max_seq_length,
+                num_tokens,
+                slice_offset,
+                hidden_size,
+                add_inputs=add_inputs,
+            )
+            slice_offset += hidden_size
+
+
+_dict_lock = Lock()
+
+
+def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, nslices: int, dtype: torch.dtype,
+                      device: str, seq_length: int, scaling: float):
+    """
+    Compare outputs of vllm.sgmv_shrink kernel against a reference
+    implementation.
+    """
+    data: PunicaTensors = generate_data_for_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        nslices,
+        dtype,
+        "shrink",
+        device,
+    )
+    max_seq_length, token_nums = data.meta()
+
+    # Preventing cache error pointer.
+    with _dict_lock:
+        _LORA_A_PTR_DICT.clear()
+        torch.ops.vllm.sgmv_shrink(
+            data.inputs_tensor,
+            data.lora_weights,
+            data.our_out_tensor,
+            data.b_seq_start_loc,
+            data.seq_len_tensor,
+            data.prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            token_nums,
+            scaling,
+        )
+
+        sgmv_shrink_for_nslices(
+            nslices,
+            data.inputs_tensor,
+            data.lora_weights,
+            data.ref_out_tensor,
+            data.b_seq_start_loc,
+            data.seq_len_tensor,
+            data.prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            token_nums,
+            scaling,
+        )
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_sgmv_expand(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, nslices: int, dtype: torch.dtype,
+                      device: str, seq_length: int, add_inputs: bool):
+    """
+    Compare outputs of vllm.sgmv_expand kernel against a reference
+    implementation.
+    """
+    data: PunicaTensors = generate_data_for_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        nslices,
+        dtype,
+        "expand",
+        device,
+    )
+
+    max_seq_length, token_nums = data.meta()
+
+    with _dict_lock:
+        _LORA_B_PTR_DICT.clear()
+        torch.ops.vllm.sgmv_expand(
+            data.inputs_tensor,
+            data.lora_weights,
+            data.our_out_tensor,
+            data.b_seq_start_loc,
+            data.seq_len_tensor,
+            data.prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            token_nums,
+            offset_start=0,
+            add_inputs=add_inputs,
+        )
+
+    sgmv_expand_for_nslices(nslices,
+                            hidden_size,
+                            data.inputs_tensor,
+                            data.lora_weights,
+                            data.ref_out_tensor,
+                            data.b_seq_start_loc,
+                            data.seq_len_tensor,
+                            data.prompt_lora_mapping,
+                            batches,
+                            max_seq_length,
+                            token_nums,
+                            add_inputs=add_inputs)
+
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, dtype: torch.dtype, device: str,
+                      scaling: float):
+    """
+    Compare vllm.bgmv_shrink against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        "shrink",
+        device,
+    )
+
+    torch.ops.vllm.bgmv_shrink(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.our_out_tensor,
+        data.token_lora_mapping,
+        scaling,
+    )
+
+    bgmv_shrink(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.ref_out_tensor,
+        data.token_lora_mapping,
+        scaling,
+    )
+
+    data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_expand(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, dtype: torch.dtype, device: str,
+                      add_inputs: bool):
+    """
+    Compare vllm.bgmv_expand against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        "expand",
+        device,
+    )
+
+    torch.ops.vllm.bgmv_expand(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.our_out_tensor,
+        data.token_lora_mapping,
+        add_inputs=add_inputs,
+    )
+    bgmv_expand(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.ref_out_tensor,
+        data.token_lora_mapping,
+        add_inputs=add_inputs,
+    )
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
+                            hidden_size: int, nslices: int, dtype: torch.dtype,
+                            device: str, add_inputs: bool):
+    """
+    Compare vllm.bgmv_expand_slice against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data_for_expand_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        nslices,
+        device,
+    )
+
+    slice_offset = 0
+    for index in range(nslices):
+        torch.ops.vllm.bgmv_expand_slice(
+            data.inputs_tensor,
+            data.lora_weights[index],
+            data.our_out_tensor,
+            data.token_lora_mapping,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=add_inputs,
+        )
+        bgmv_expand_slice(
+            data.inputs_tensor,
+            data.lora_weights[index],
+            data.ref_out_tensor,
+            data.token_lora_mapping,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=add_inputs,
+        )
+
+        slice_offset += hidden_size
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+# Tests
+# We test the punica kernels along 2 verticals mainly.
+# 1. Variations in hidden_dim size
+# 2. Variations in all other parameters like (batch_size, max_rank, num_loras
+#  etc.)
+
+# We have collected the hidden_sizes included in the LoRA models
+# currently supported by vLLM. It tests whether the corresponding Triton
+# kernel can run normally when tensor parallelism is set to
+# [1, 2, 4, 8, 16, 32, 64].
+HIDDEN_SIZES = [
+    128,
+    256,
+    512,
+    896,
+    1024,
+    1152,
+    1216,
+    1280,
+    1536,
+    1664,
+    2048,
+    2240,
+    2304,
+    2368,
+    2432,
+    2560,
+    2752,
+    3072,
+    3328,
+    3456,
+    3584,
+    3712,
+    4096,
+    4480,
+    4608,
+    4736,
+    4864,
+    5120,
+    5504,
+    5632,
+    5888,
+    6144,
+    6400,
+    6848,
+    6912,
+    7168,
+    7424,
+    8192,
+    8960,
+    9216,
+    9472,
+    10240,
+    11008,
+    11264,
+    13824,
+    14336,
+    14784,
+    14848,
+    15360,
+    18944,
+    22016,
+    22528,
+    24576,
+    27392,
+    27648,
+    29568,
+    29696,
+    32000,
+    32256,
+    32512,
+    32768,
+    33024,
+    36864,
+    43264,
+    49152,
+    49408,
+    60544,
+    60672,
+    64000,
+    64256,
+    102400,
+    102656,
+    128000,
+    128256,
+]
+#The size of TP
+divisibility = [1, 2, 8, 16, 64]
+
+all_hidden_size = []
+for div in divisibility:
+    for hidden_size in HIDDEN_SIZES:
+        all_hidden_size.append(hidden_size // div)
+
+HIDDEN_SIZES = list(set(all_hidden_size))
+
+# Test params that focuses on hidden_size variation.
+hs_test_params = {
+    "hidden_sizes": HIDDEN_SIZES,
+    "batches": [4],
+    "num_loras": [4],
+    "max_ranks": [32],
+}
+
+# General tests params that tests for variations in all dimensions
+# except hidden_size.
+test_params = {
+    "hidden_sizes": [2049],
+    "batches": [1, 4, 16, 32],
+    "num_loras": [1, 8, 32, 128],
+    "max_ranks": [1, 4, 8, 16, 32, 64, 128, 256],
+}
+
+DTYPES = [torch.float16, torch.bfloat16]
+DEVICES = [f"cuda:{0}"]
+SEED = [0]
+
+
+@pytest.mark.parametrize("batches", test_params['batches'])
+@pytest.mark.parametrize("num_loras", test_params['num_loras'])
+@pytest.mark.parametrize("rank", test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [1, 2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_sgmv(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_sgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          scaling=0.5)
+    else:
+        check_sgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", hs_test_params['batches'])
+@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
+@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [1, 2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_sgmv_hidden_size(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_sgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          scaling=0.5)
+    else:
+        check_sgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", test_params['batches'])
+@pytest.mark.parametrize("num_loras", test_params['num_loras'])
+@pytest.mark.parametrize("rank", test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_bgmv(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_bgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          scaling=0.5)
+    else:
+        check_bgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", hs_test_params['batches'])
+@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
+@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_bgmv_hidden_size(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_bgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          scaling=0.5)
+    else:
+        check_bgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", test_params['batches'])
+@pytest.mark.parametrize("num_loras", test_params['num_loras'])
+@pytest.mark.parametrize("rank", test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
+                                    hidden_size: int, nslices: int,
+                                    dtype: torch.dtype, device: str,
+                                    seed: int):
+
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    check_bgmv_expand_slice(batches=batches,
+                            num_loras=num_loras,
+                            rank=rank,
+                            hidden_size=hidden_size,
+                            nslices=nslices,
+                            dtype=dtype,
+                            device=device,
+                            add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", hs_test_params['batches'])
+@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
+@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
+                                                rank: int, hidden_size: int,
+                                                nslices: int,
+                                                dtype: torch.dtype,
+                                                device: str, seed: int):
+
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    check_bgmv_expand_slice(batches=batches,
+                            num_loras=num_loras,
+                            rank=rank,
+                            hidden_size=hidden_size,
+                            nslices=nslices,
+                            dtype=dtype,
+                            device=device,
+                            add_inputs=True)
--- a/tests/lora/test_punica_ops_sizes.py
+++ b/tests/lora/test_punica_ops_sizes.py
-# SPDX-License-Identifier: Apache-2.0
-"""
-This script is mainly used to tests various hidden_sizes. We have collected the
-hidden_sizes included in the LoRA models currently supported by vLLM. It tests
-whether the corresponding Triton kernel can run normally when tensor parallelism
-is set to [1, 2, 4, 8, 16, 32, 64].
-"""
-from threading import Lock
-
-import pytest
-import torch
-
-import vllm.lora.ops.triton_ops  # noqa: F401
-from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-                                     bgmv_shrink, sgmv_expand,
-                                     sgmv_expand_slice, sgmv_shrink)
-from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.platforms import current_platform
-
-from .utils import (assert_close, generate_data,
-                    generate_data_for_expand_nslices,
-                    generate_data_for_nslices)
-
-HIDDEN_SIZES = [
-    128,
-    256,
-    512,
-    896,
-    1024,
-    1152,
-    1216,
-    1280,
-    1536,
-    1664,
-    2048,
-    2240,
-    2304,
-    2368,
-    2432,
-    2560,
-    2752,
-    3072,
-    3328,
-    3456,
-    3584,
-    3712,
-    4096,
-    4480,
-    4608,
-    4736,
-    4864,
-    5120,
-    5504,
-    5632,
-    5888,
-    6144,
-    6400,
-    6848,
-    6912,
-    7168,
-    7424,
-    8192,
-    8960,
-    9216,
-    9472,
-    10240,
-    11008,
-    11264,
-    13824,
-    14336,
-    14784,
-    14848,
-    15360,
-    18944,
-    22016,
-    22528,
-    24576,
-    27392,
-    27648,
-    29568,
-    29696,
-    32000,
-    32256,
-    32512,
-    32768,
-    33024,
-    36864,
-    43264,
-    49152,
-    49408,
-    60544,
-    60672,
-    64000,
-    64256,
-    102400,
-    102656,
-    128000,
-    128256,
-]
-#The size of TP
-divisibility = [1, 2, 8, 16, 64]
-
-all_hidden_size = []
-for div in divisibility:
-    for hidden_size in HIDDEN_SIZES:
-        all_hidden_size.append(hidden_size // div)
-
-HIDDEN_SIZES = list(set(all_hidden_size))
-
-BATCHES = [4]
-NUM_LORA = [4]
-DTYPES = [torch.float16, torch.bfloat16]
-MAX_RANKS = [32]
-SCALES = [0.5]
-SEED = [0]
-DEVICES = [f"cuda:{0}"]
-
-_dict_lock = Lock()
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("scaling", SCALES)
-@pytest.mark.parametrize("nslices", [1, 2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_sgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    scaling: float,
-    nslices: int,
-    dtype: torch.dtype,
-    op_type: str,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 128
-    (
-        inputs_tensor,
-        lora_weights_lst,
-        our_out_tensor,
-        ref_out_tensor,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data_for_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        nslices,
-        dtype,
-        op_type,
-        device,
-    )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
-    if op_type == "shrink":
-        # Preventing cache error pointer.
-        with _dict_lock:
-            _LORA_A_PTR_DICT.clear()
-            torch.ops.vllm.sgmv_shrink(
-                inputs_tensor,
-                lora_weights_lst,
-                our_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                scaling,
-            )
-        for index in range(nslices):
-            sgmv_shrink(
-                inputs_tensor,
-                lora_weights_lst[index],
-                ref_out_tensor[index],
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                scaling,
-            )
-
-    else:
-        with _dict_lock:
-            _LORA_B_PTR_DICT.clear()
-            torch.ops.vllm.sgmv_expand(
-                inputs_tensor,
-                lora_weights_lst,
-                our_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                offset_start=0,
-                add_inputs=True,
-            )
-        if nslices == 1:
-            # Verify the torch's sgmv_expand op
-            sgmv_expand(
-                inputs_tensor[0],
-                lora_weights_lst[0],
-                ref_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                add_inputs=True,
-            )
-        else:
-            slice_offset = 0
-            for index in range(nslices):
-                lora_weights = lora_weights_lst[index]
-                sgmv_expand_slice(
-                    inputs_tensor[index],
-                    lora_weights,
-                    ref_out_tensor,
-                    b_seq_start_loc,
-                    seq_len_tensor,
-                    lora_indices_tensor,
-                    batches,
-                    max_seq_length,
-                    token_nums,
-                    slice_offset,
-                    hidden_size,
-                    add_inputs=True,
-                )
-                slice_offset += hidden_size
-
-    assert_close(our_out_tensor, ref_out_tensor)
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("scaling", SCALES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_bgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    scaling: float,
-    dtype: torch.dtype,
-    op_type: str,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 1
-    (
-        inputs_tensor,
-        lora_weights,
-        our_out_tensor,
-        ref_out_tensor,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        op_type,
-        device,
-    )
-    if op_type == "shrink":
-        torch.ops.vllm.bgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            indices,
-            scaling,
-        )
-
-        bgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            ref_out_tensor,
-            indices,
-            scaling,
-        )
-
-    else:
-        torch.ops.vllm.bgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            indices,
-            add_inputs=True,
-        )
-        bgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            ref_out_tensor,
-            indices,
-            add_inputs=True,
-        )
-
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
-    assert_close(our_out_tensor, ref_out_tensor)
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("nslices", [2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_bgmv_expand_nslices(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    nslices: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 1
-    (
-        inputs_tensor,
-        lora_weights_lst,
-        our_outputs,
-        ref_outputs,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data_for_expand_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        nslices,
-        device,
-    )
-    slice_offset = 0
-    for index in range(nslices):
-        lora_weights = lora_weights_lst[index]
-        torch.ops.vllm.bgmv_expand_slice(
-            inputs_tensor,
-            lora_weights,
-            our_outputs,
-            indices,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=True,
-        )
-        bgmv_expand_slice(
-            inputs_tensor,
-            lora_weights,
-            ref_outputs,
-            indices,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=True,
-        )
-
-        slice_offset += hidden_size
-    assert_close(our_outputs, ref_outputs)
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -73,6 +73,14 @@ def do_sample(llm: vllm.LLM,
    return generated_texts


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
 # SPDX-License-Identifier: Apache-2.0
-
-from typing import List
+from dataclasses import dataclass
+from typing import Dict, List, Optional

 import pytest
+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION

 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform

-MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"

-PROMPT_TEMPLATE = (
-    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
-    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
-    "What is in the image?<|im_end|>\n"
-    "<|im_start|>assistant\n")
+@dataclass
+class TestConfig:
+    model_path: str
+    lora_path: str
+    max_num_seqs: int = 2
+    max_loras: int = 2
+    max_lora_rank: int = 16
+    max_model_len: int = 4096
+    mm_processor_kwargs: Optional[Dict[str, int]] = None
+
+    def __post_init__(self):
+        if self.mm_processor_kwargs is None:
+            self.mm_processor_kwargs = {
+                "min_pixels": 28 * 28,
+                "max_pixels": 1280 * 28 * 28,
+            }
+
+
+class Qwen2VLTester:
+    """Test helper for Qwen2 VL models with LoRA"""
+
+    PROMPT_TEMPLATE = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
+        "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+        "What is in the image?<|im_end|>\n"
+        "<|im_start|>assistant\n")
+
+    def __init__(self, config: TestConfig):
+        self.config = config
+        self.llm = self._initialize_llm()
+
+    def _initialize_llm(self) -> vllm.LLM:
+        """Initialize the LLM with given configuration"""
+        return vllm.LLM(
+            model=self.config.model_path,
+            max_num_seqs=self.config.max_num_seqs,
+            enable_lora=True,
+            max_loras=self.config.max_loras,
+            max_lora_rank=self.config.max_lora_rank,
+            trust_remote_code=True,
+            mm_processor_kwargs=self.config.mm_processor_kwargs,
+            max_model_len=self.config.max_model_len,
+        )
+
+    def run_test(self,
+                 images: List[ImageAsset],
+                 expected_outputs: List[str],
+                 lora_id: Optional[int] = None,
+                 temperature: float = 0,
+                 max_tokens: int = 5) -> List[str]:
+
+        sampling_params = vllm.SamplingParams(
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        inputs = [{
+            "prompt": self.PROMPT_TEMPLATE,
+            "multi_modal_data": {
+                "image": asset.pil_image
+            },
+        } for asset in images]
+
+        lora_request = LoRARequest(str(lora_id), lora_id,
+                                   self.config.lora_path)
+        outputs = self.llm.generate(inputs,
+                                    sampling_params,
+                                    lora_request=lora_request)
+        generated_texts = [
+            output.outputs[0].text.strip() for output in outputs
+        ]

-IMAGE_ASSETS = [
+        # Validate outputs
+        for generated, expected in zip(generated_texts, expected_outputs):
+            assert expected.startswith(
+                generated), f"Generated text {generated} doesn't "
+            f"match expected pattern {expected}"
+
+        return generated_texts
+
+
+TEST_IMAGES = [
    ImageAsset("stop_sign"),
    ImageAsset("cherry_blossom"),
 ]

-# After fine-tuning with LoRA, all generated content should start begin `A`.
-EXPECTED_OUTPUT = [
+EXPECTED_OUTPUTS = [
    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
 ]

-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
-    sampling_params = vllm.SamplingParams(
-        temperature=0,
-        max_tokens=5,
-    )
-
-    inputs = [{
-        "prompt": PROMPT_TEMPLATE,
-        "multi_modal_data": {
-            "image": asset.pil_image
-        },
-    } for asset in IMAGE_ASSETS]
-
-    outputs = llm.generate(
-        inputs,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
-    )
-    # Print the outputs.
-    generated_texts: List[str] = []
-    for output in outputs:
-        generated_text = output.outputs[0].text.strip()
-        generated_texts.append(generated_text)
-        print(f"Generated text: {generated_text!r}")
-    return generated_texts
+QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
+QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"


 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="Qwen2-VL dependency xformers incompatible with ROCm")
 def test_qwen2vl_lora(qwen2vl_lora_files):
-    llm = vllm.LLM(
-        MODEL_PATH,
-        max_num_seqs=2,
-        enable_lora=True,
-        max_loras=2,
-        max_lora_rank=16,
-        trust_remote_code=True,
-        mm_processor_kwargs={
-            "min_pixels": 28 * 28,
-            "max_pixels": 1280 * 28 * 28,
-        },
-        max_model_len=4096,
-    )
-    output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output1[i])
-
-    output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2)
-    for i in range(len(EXPECTED_OUTPUT)):
-        assert EXPECTED_OUTPUT[i].startswith(output2[i])
+    """Test Qwen 2.0 VL model with LoRA"""
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
+                        lora_path=qwen2vl_lora_files)
+    tester = Qwen2VLTester(config)
+
+    # Test with different LoRA IDs
+    for lora_id in [1, 2]:
+        tester.run_test(TEST_IMAGES,
+                        expected_outputs=EXPECTED_OUTPUTS,
+                        lora_id=lora_id)
+
+
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
+)
+@pytest.mark.skipif(
+    Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
+    reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
+)
+def test_qwen25vl_lora(qwen25vl_lora_files):
+    """Test Qwen 2.5 VL model with LoRA"""
+    config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
+                        lora_path=qwen25vl_lora_files)
+    tester = Qwen2VLTester(config)
+
+    # Test with different LoRA IDs
+    for lora_id in [1, 2]:
+        tester.run_test(TEST_IMAGES,
+                        expected_outputs=EXPECTED_OUTPUTS,
+                        lora_id=lora_id)
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import Dict, List, Optional
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union

 import torch

@@ -106,6 +107,31 @@ def assert_close(a, b):
    torch.testing.assert_close(a, b, rtol=rtol, atol=atol)


+@dataclass
+class PunicaTensors:
+    inputs_tensor: torch.Tensor
+    lora_weights: Union[torch.Tensor, List[torch.Tensor]]
+    our_out_tensor: torch.Tensor
+    ref_out_tensor: torch.Tensor
+    b_seq_start_loc: torch.Tensor
+    prompt_lora_mapping: torch.Tensor
+    seq_len_tensor: torch.Tensor
+    token_lora_mapping: torch.Tensor
+
+    def meta(self) -> Tuple[int, int]:
+        """
+        Infer max_seq_length and token_nums from the tensors
+        and return them.
+        """
+        max_seq_length = self.seq_len_tensor.max()
+        token_nums = self.seq_len_tensor.sum().item()
+        if isinstance(max_seq_length, tuple):
+            max_seq_length = max_seq_length[0].item()
+        else:
+            max_seq_length = max_seq_length.item()
+        return max_seq_length, token_nums
+
+
 def generate_data(
    batches,
    hidden_size,
@@ -115,7 +141,7 @@ def generate_data(
    dtype,
    op_type,
    device,
-):
+) -> PunicaTensors:
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    b_seq_start_loc = torch.cumsum(
@@ -164,7 +190,8 @@ def generate_data(
        indices[current_offset:current_offset +
                seq_len_tensor[b_id]].copy_(lora_index)
        current_offset += seq_len_tensor[b_id].item()
-    return (
+
+    return PunicaTensors(
        inputs_tensor,
        lora_weights,
        our_out_tensor,
@@ -185,7 +212,7 @@ def generate_data_for_expand_nslices(
    dtype,
    nslices,
    device,
-):
+) -> PunicaTensors:
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    b_seq_start_loc = torch.cumsum(
@@ -222,7 +249,7 @@ def generate_data_for_expand_nslices(
        current_offset += seq_len_tensor[b_id].item()

    lora_indices_tensor = lora_indices_tensor.to(device)
-    return (
+    return PunicaTensors(
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,
@@ -244,7 +271,7 @@ def generate_data_for_nslices(
    dtype,
    op_type,
    device,
-):
+) -> PunicaTensors:
    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                   (batches, )).to(device)
    b_seq_start_loc = torch.cumsum(
@@ -302,7 +329,7 @@ def generate_data_for_nslices(
        current_offset += seq_len_tensor[b_id].item()

    lora_indices_tensor = lora_indices_tensor.to(device)
-    return (
+    return PunicaTensors(
        inputs_tensor,
        lora_weights_lst,
        our_out_tensor,

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -9,6 +9,7 @@ import ray
 from prometheus_client import REGISTRY

 from vllm import EngineArgs, LLMEngine
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -18,10 +19,14 @@ import vllm.envs as envs

 from ..utils import models_path_prefix

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
+
 MODELS = [
-    os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "distilbert/distilgpt2"),
 ]

+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
+

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@@ -145,8 +150,9 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
        metrics_tag_content = stat_logger.labels["model_name"]

    if served_model_name is None or served_model_name == []:
-        assert metrics_tag_content == model, (
-            f"Metrics tag model_name is wrong! expect: {model!r}\n"
+        actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}"
+        assert metrics_tag_content == actual_model_name, (
+            f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
            f"actual: {metrics_tag_content!r}")
    else:
        assert metrics_tag_content == served_model_name[0], (
@@ -174,7 +180,8 @@ async def test_async_engine_log_metrics_regression(
    """
    engine_args = AsyncEngineArgs(model=model,
                                  dtype=dtype,
-                                  disable_log_stats=disable_log_stats)
+                                  disable_log_stats=disable_log_stats,
+                                  load_format=RUNAI_STREAMER_LOAD_FORMAT)
    async_engine = AsyncLLMEngine.from_engine_args(engine_args)
    for i, prompt in enumerate(example_prompts):
        results = async_engine.generate(
@@ -203,7 +210,8 @@ def test_engine_log_metrics_regression(
 ) -> None:
    engine_args = EngineArgs(model=model,
                             dtype=dtype,
-                             disable_log_stats=disable_log_stats)
+                             disable_log_stats=disable_log_stats,
+                             load_format=RUNAI_STREAMER_LOAD_FORMAT)
    engine = LLMEngine.from_engine_args(engine_args)
    for i, prompt in enumerate(example_prompts):
        engine.add_request(
@@ -287,7 +295,8 @@ def test_metric_spec_decode_interval(
                             gpu_memory_utilization=0.4,
                             speculative_model=model,
                             num_speculative_tokens=k,
-                             enforce_eager=True)
+                             enforce_eager=True,
+                             load_format=RUNAI_STREAMER_LOAD_FORMAT)

    engine = LLMEngine.from_engine_args(engine_args)


--- a/tests/mistral_tool_use/__init__.py
+++ b/tests/mistral_tool_use/__init__.py