Merge tag 'v0.8.2' into v0.8.2-dev

469e903b · zhuwenwen · 389ebcf7 · 25f560a6 · 469e903b · 469e903b
Commit 469e903b authored Mar 28, 2025 by zhuwenwen
20 changed files
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
 # SPDX-License-Identifier: Apache-2.0

 import ast
-from typing import List, Optional, Tuple
+from typing import Optional

 import numpy as np
 import pytest
@@ -9,7 +9,7 @@ import os

 import vllm
 from vllm import SamplingParams
-from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora
+from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLoRA
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.rotary_embedding import (
    LinearScalingRotaryEmbedding)
@@ -88,7 +88,7 @@ def evaluate_json_response(model_response, golden_response):

 def generate(
    llm: vllm.LLM,
-    inputs: Tuple[str, SamplingParams, Optional[LoRARequest]],
+    inputs: tuple[str, SamplingParams, Optional[LoRARequest]],
 ):
    prompts, sampling_param, lora_request = inputs
    outputs = llm.generate(prompts, sampling_param, lora_request=lora_request)
@@ -97,7 +97,7 @@ def generate(

 def batched_generate(
    llm: vllm.LLM,
-    inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]],
+    inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
 ):
    for input in inputs:
        prompt, sampling_param, lora_req = input
@@ -153,7 +153,7 @@ def test_rotary_emb_replaced(dist_init):
        if "rotary_emb" in module_name:
            if "base_layer" not in module_name:
                rotary_emb_count += 1
-                assert isinstance(module, LinearScalingRotaryEmbeddingWithLora)
+                assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
            else:
                assert isinstance(module, LinearScalingRotaryEmbedding)
    # Llama 2 has 32 layers.
@@ -166,7 +166,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
        non-batched generation.
    """
    # Create non batched results first to compare against batched results
-    non_batched_results: List[str] = []
+    non_batched_results: list[str] = []

    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
@@ -179,7 +179,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
    # Create batched results
    # Each element of the batch must be
    # (prompt, prompt_sampling_params, prompt_lora_request)
-    batched_prompts: List[Tuple[str, SamplingParams,
+    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
@@ -204,7 +204,7 @@ def test_self_consistency(lora_llm, long_context_infos):
    num_loras = len(long_context_infos)

    # Create results in order of long_context_infos
-    batched_prompts: List[Tuple[str, SamplingParams,
+    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
@@ -253,7 +253,7 @@ def test_quality(lora_llm, long_context_infos):
    The test is expected to run for about 1 minute on a p4de.24xlarge
    instance.
    """
-    scores: List[float] = []
+    scores: list[float] = []
    for lora_id, info in long_context_infos.items():
        context_len = info["context_length"]
        for prompt_and_response in prompts_and_responses[context_len]:
@@ -286,7 +286,7 @@ def test_max_len(lora_llm, long_context_infos):
            generate(lora_llm, (bad_prompt, sampling_params, lora_request))

    # Also test batched
-    batched_prompts: List[Tuple[str, SamplingParams,
+    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id_with_bad_inputs in long_context_infos:
        for lora_id, info in long_context_infos.items():

--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import pytest

 from vllm.lora.models import LoRAModel
@@ -12,6 +10,12 @@ from vllm.model_executor.models.utils import WeightsMapper
 lora_lst = [
    "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
 ]
+BAICHUAN_LORA_MODULES = [
+    "W_pack",
+    "o_proj",
+    "gate_up_proj",
+    "down_proj",
+]


 @pytest.mark.parametrize("lora_name", lora_lst)
@@ -22,12 +26,11 @@ def test_load_checkpoints(
    baichuan_regex_lora_files,
    chatglm3_lora_files,
 ):
-    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
-    expected_lora_modules: List[str] = []
-    for module in supported_lora_modules:
+    expected_lora_modules: list[str] = []
+    for module in BAICHUAN_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:
@@ -90,12 +93,12 @@ def test_load_checkpoints(


 def test_lora_weights_mapping(baichuan_lora_files):
-    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
+
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
-    expected_lora_modules: List[str] = []
-    for module in supported_lora_modules:
+    expected_lora_modules: list[str] = []
+    for module in BAICHUAN_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:

--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+Script to test add_lora, remove_lora, pin_lora, list_loras functions.
+"""
+
+import os
+
+import pytest
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "meta-llama/Llama-2-7b-hf"
+LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
+LORA_RANK = 8
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+def make_lora_request(lora_id: int):
+    return LoRARequest(lora_name=f"{lora_id}",
+                       lora_int_id=lora_id,
+                       lora_path=LORA_MODULE_PATH)
+
+
+def test_lora_functions_sync():
+
+    max_loras = 4
+    # Create engine in eager-mode. Due to high max_loras, the CI can
+    # OOM during cuda-graph capture.
+    engine_args = EngineArgs(model=MODEL_PATH,
+                             enable_lora=True,
+                             max_loras=max_loras,
+                             max_lora_rank=LORA_RANK,
+                             max_model_len=128,
+                             gpu_memory_utilization=0.8,
+                             enforce_eager=True)
+
+    llm = LLMEngine.from_engine_args(engine_args)
+
+    def run_check(fn, args, expected: list):
+        fn(args)
+        assert set(llm.list_loras()) == set(expected)
+
+    run_check(llm.add_lora, make_lora_request(1), [1])
+    run_check(llm.add_lora, make_lora_request(2), [1, 2])
+
+    # Pin LoRA 1 and test that it is never removed on subsequent adds.
+    run_check(llm.pin_lora, 1, [1, 2])
+    run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
+    run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
+    run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
+    run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
+    run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
+    run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
+    run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
+    run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])
+
+    # Remove LoRA 1 and continue adding.
+    run_check(llm.remove_lora, 1, [8, 9, 10])
+    run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
+    run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
+    run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
+
+    # Remove all LoRAs
+    run_check(llm.remove_lora, 13, [12, 10, 11])
+    run_check(llm.remove_lora, 12, [10, 11])
+    run_check(llm.remove_lora, 11, [10])
+    run_check(llm.remove_lora, 10, [])
+
+
+@pytest.mark.asyncio
+async def test_lora_functions_async():
+
+    if os.getenv("VLLM_USE_V1") == "0":
+        pytest.skip(
+            reason=
+            "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions")
+
+    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
+    # environment variable. reload vllm.enging.async_llm_engine as
+    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
+    # env var.
+    import importlib
+
+    import vllm.engine.async_llm_engine
+    importlib.reload(vllm.engine.async_llm_engine)
+    from vllm.entrypoints.openai.api_server import (
+        build_async_engine_client_from_engine_args)
+
+    max_loras = 4
+    engine_args = AsyncEngineArgs(model=MODEL_PATH,
+                                  enable_lora=True,
+                                  max_loras=max_loras,
+                                  max_lora_rank=LORA_RANK,
+                                  max_model_len=128,
+                                  gpu_memory_utilization=0.8,
+                                  enforce_eager=True)
+
+    async def run_check(fn, args, expected: list):
+        await fn(args)
+        assert set(await llm.list_loras()) == set(expected)
+
+    async with build_async_engine_client_from_engine_args(engine_args) as llm:
+        await run_check(llm.add_lora, make_lora_request(1), [1])
+        await run_check(llm.add_lora, make_lora_request(2), [1, 2])
+
+        # Pin LoRA 1 and test that it is never removed on subsequent adds.
+        await run_check(llm.pin_lora, 1, [1, 2])
+        await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3])
+        await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4])
+        await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4])
+        await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4])
+        await run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7])
+        await run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7])
+        await run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7])
+        await run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10])
+
+        # Remove LoRA 1 and continue adding.
+        await run_check(llm.remove_lora, 1, [8, 9, 10])
+        await run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11])
+        await run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
+        await run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
+
+        # Remove all LoRAs
+        await run_check(llm.remove_lora, 13, [12, 10, 11])
+        await run_check(llm.remove_lora, 12, [10, 11])
+        await run_check(llm.remove_lora, 11, [10])
+        await run_check(llm.remove_lora, 10, [])
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import pytest

 from vllm.lora.models import LoRAModel
@@ -11,17 +9,20 @@ from vllm.model_executor.models.llama import LlamaForCausalLM

 # Provide absolute path and huggingface lora ids
 lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
+LLAMA_LORA_MODULES = [
+    "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
+    "lm_head"
+]


 @pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
 def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
    lora_name = request.getfixturevalue(lora_fixture_name)
-    supported_lora_modules = LlamaForCausalLM.supported_lora_modules
    packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping
    embedding_modules = LlamaForCausalLM.embedding_modules
    embed_padding_modules = LlamaForCausalLM.embedding_padding_modules
-    expected_lora_modules: List[str] = []
-    for module in supported_lora_modules:
+    expected_lora_modules: list[str] = []
+    for module in LLAMA_LORA_MODULES:
        if module in packed_modules_mapping:
            expected_lora_modules.extend(packed_modules_mapping[module])
        else:

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
 # SPDX-License-Identifier: Apache-2.0

 import os
-from typing import Dict, List

 import pytest
 import torch
 from safetensors.torch import load_file
 from torch import nn

+from vllm import envs
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                              MergedColumnParallelLinearWithLoRA,
@@ -19,7 +19,6 @@ from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
                                      WorkerLoRAManager)
-from vllm.model_executor.layers.linear import RowParallelLinear
 from vllm.platforms import current_platform

 EMBEDDING_MODULES = {
@@ -73,9 +72,9 @@ def test_from_lora_tensors(sql_lora_files, device):
            assert lora.embeddings_tensor is None


-def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str],
+def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
                device: torch.device) -> LoRAModel:
-    loras: Dict[str, LoRALayerWeights] = {}
+    loras: dict[str, LoRALayerWeights] = {}
    for name in sub_modules:
        w = model.get_submodule(name).weight
        loras[name] = LoRALayerWeights(
@@ -97,7 +96,7 @@ def create_packed_lora(
    empty_replaced_module_name=None,
 ) -> LoRAModel:
    w = model.get_submodule(module_name).weight
-    loras: Dict[str, LoRALayerWeights] = {}
+    loras: dict[str, LoRALayerWeights] = {}
    for replaced_module_name in replaced_module_names:
        if replaced_module_name == empty_replaced_module_name:
            continue
@@ -114,19 +113,16 @@ def create_packed_lora(

 def test_replace_submodules(dist_init, dummy_model):
    model = dummy_model
-    model.supported_lora_modules = ["dense1", "layer1.dense2"]
-    model.packed_modules_mapping = {}
    manager = LoRAModelManager(
        model, 1, 1, 1,
        LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
        torch.device(DEVICES[0]))
    model = manager.model
-
    assert isinstance(model.get_submodule("dense1"),
                      ColumnParallelLinearWithLoRA)
    assert isinstance(model.get_submodule("layer1.dense1"),
                      ColumnParallelLinearWithLoRA)
-    assert isinstance(model.get_submodule("dense2"), RowParallelLinear)
+    assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA)
    assert isinstance(model.get_submodule("layer1.dense2"),
                      RowParallelLinearWithLoRA)

@@ -134,8 +130,6 @@ def test_replace_submodules(dist_init, dummy_model):
 @pytest.mark.parametrize("device", DEVICES)
 def test_lora_model_manager(dist_init, dummy_model, device):
    model = dummy_model
-    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
-    model.packed_modules_mapping = {}
    model_lora1 = create_lora(1,
                              model, ["layer1.dense1", "dense2", "lm_head"],
                              device=device)
@@ -190,13 +184,18 @@ def test_lora_model_manager(dist_init, dummy_model, device):

    assert manager.device == device
    assert manager.punica_wrapper.device == device
+    assert hasattr(manager, "supported_lora_modules")
+    assert sorted(manager.supported_lora_modules) == [
+        "dense1",
+        "dense2",
+        "lm_head",
+        "output",
+    ]


 @pytest.mark.parametrize("device", DEVICES)
 def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
    model = dummy_model
-    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
-    model.packed_modules_mapping = {}
    model_lora1 = create_lora(1,
                              model, ["layer1.dense1", "dense2", "lm_head"],
                              device=device)
@@ -289,8 +288,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    # This tests just the LRU cache functionality, everything else is
    # tested in test_lora_model_manager
    model = dummy_model
-    model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
-    model.packed_modules_mapping = {}
    model_lora1 = create_lora(1,
                              model, ["layer1.dense1", "dense2", "lm_head"],
                              device=device)
@@ -414,6 +411,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    assert manager.device == device


+@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                          sql_lora_files, device):
@@ -493,6 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
            device)


+@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                sql_lora_files, device):
@@ -572,13 +571,6 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
 @pytest.mark.parametrize("device", DEVICES)
 def test_packed_loras(dist_init, dummy_model_gate_up, device):
    model = dummy_model_gate_up
-    model.supported_lora_modules = ["gate_up_proj"]
-    model.packed_modules_mapping = {
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
-    }
    model_lora = create_packed_lora(
        1,
        model,

--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import os
 import pytest

 import vllm
-from tests.utils import fork_new_process_for_each_test
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 from ..utils import models_path_prefix

+from ..utils import create_new_process_for_each_test
+
 MODEL_PATH = os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")

 PROMPT_TEMPLATE = (
@@ -29,7 +28,7 @@ EXPECTED_OUTPUT = [
 ]


-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    sampling_params = vllm.SamplingParams(
        temperature=0,
        max_tokens=5,
@@ -50,7 +49,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
        if lora_id else None,
    )
    # Print the outputs.
-    generated_texts: List[str] = []
+    generated_texts: list[str] = []
    for output in outputs:
        generated_text = output.outputs[0].text.strip()
        generated_texts.append(generated_text)
@@ -61,7 +60,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_minicpmv_lora(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
@@ -84,7 +83,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
@@ -105,7 +104,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
 @pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
-@fork_new_process_for_each_test
+@create_new_process_for_each_test()
 def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,

--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import pytest
 import torch
 import os
@@ -14,8 +12,16 @@ from ..utils import models_path_prefix
 MODEL_PATH = os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1")


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
-              prompts: List[str]) -> List[str]:
+              prompts: list[str]) -> list[str]:

    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
@@ -24,7 +30,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
-    generated_texts: List[str] = []
+    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()
@@ -65,45 +71,3 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
                     prompts=prompts) == expected_lora_output
    assert do_sample(llm, mixtral_lora_files, lora_id=2,
                     prompts=prompts) == expected_lora_output
-
-
-@pytest.mark.parametrize("tp_size", [4])
-@pytest.mark.parametrize("fully_shard", [True, False])
-def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size, fully_shard):
-    """This LoRA model has all supported Mixtral target modules"""
-
-    if torch.cuda.device_count() < tp_size:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-
-    prompts = [
-        "Question: What happens if you touch the eyes of a blind man?\nChoices:\nB: The man's sight will be restored.\nA: Nothing happens if you touch the eyes of a blind man.\nC: The man will get his eyesight back.\nD: The man will no longer be blind.\nAnswer:",  # noqa: E501
-        "Question: Which will increase the energy of motion of water molecules?\nChoices:\nA: add heat\nB: add ice\nC: remove some of the water\nD: place the water in the freezer\nAnswer:",  # noqa: E501
-        "Since Craig threw aluminum cans in the trash and Benjamin recycled, _ was environmentally irresponsible.\nChoices:\n1: Craig\n2: Benjamin\nAnswer:",  # noqa: E501
-    ]
-
-    llm = vllm.LLM(
-        MODEL_PATH,
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=4,
-        distributed_executor_backend="ray",
-        tensor_parallel_size=tp_size,
-        fully_sharded_loras=fully_shard,
-        max_lora_rank=32,
-    )
-
-    expected_lora_output = [
-        "A: Nothing happens if you touch the eyes of a blind man.",
-        "A: add heat",
-        "1: Craig",
-    ]
-
-    assert do_sample(llm,
-                     mixtral_lora_files_all_target_modules,
-                     lora_id=1,
-                     prompts=prompts) == expected_lora_output
-    assert do_sample(llm,
-                     mixtral_lora_files_all_target_modules,
-                     lora_id=2,
-                     prompts=prompts) == expected_lora_output
--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
 import os
-
 import pytest

 import vllm
@@ -14,7 +12,7 @@ MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2")
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501


-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(
            sql_prompt=
@@ -43,7 +41,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
        if lora_id else None,
    )
    # Print the outputs.
-    generated_texts: List[str] = []
+    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text.strip()

--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
 # SPDX-License-Identifier: Apache-2.0
 from threading import Lock
-from typing import List

 import pytest
 import torch

-import vllm.lora.ops.triton_ops  # noqa: F401
-from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-                                     bgmv_shrink, sgmv_expand,
-                                     sgmv_expand_slice, sgmv_shrink)
+import vllm.lora.ops.torch_ops as torch_ops
+import vllm.lora.ops.triton_ops as triton_ops
+from vllm.lora.ops.triton_ops import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 from vllm.platforms import current_platform

-from .utils import (PunicaTensors, assert_close, generate_data,
-                    generate_data_for_expand_nslices,
-                    generate_data_for_nslices)
+from .utils import PunicaTensors, assert_close, generate_data_for_nslices


 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
        nslices: int, inputs_tensor: torch.Tensor,
-        lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor,
+        lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor,
        b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
        prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
        num_tokens: int, scaling: float):
    """
-    Wrapper around sgmv_shrink that handles any nslices.
+    Wrapper around torch_ops.sgmv_shrink that handles any nslices.
    """
    for index in range(nslices):
-        sgmv_shrink(
+        torch_ops.sgmv_shrink(
            inputs_tensor,
            lora_weights_lst[index],
            out_tensor[index],
@@ -44,7 +40,7 @@ def sgmv_shrink_for_nslices(

 def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
                            inputs_tensor: torch.Tensor,
-                            lora_weights_lst: List[torch.Tensor],
+                            lora_weights_lst: list[torch.Tensor],
                            out_tensor: torch.Tensor,
                            b_seq_start_loc: torch.Tensor,
                            seq_len_tensor: torch.Tensor,
@@ -52,11 +48,11 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
                            max_seq_length: int, num_tokens: int,
                            add_inputs: bool) -> None:
    """
-    Wrapper around sgmv_expand that handles any nslices.
+    Wrapper around torch_ops.sgmv_expand that handles any nslices.
    """
    if nslices == 1:
        # Verify the torch's sgmv_expand op
-        sgmv_expand(
+        torch_ops.sgmv_expand(
            inputs_tensor[0],
            lora_weights_lst[0],
            out_tensor,
@@ -72,7 +68,7 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
        slice_offset = 0
        for index in range(nslices):
            lora_weights = lora_weights_lst[index]
-            sgmv_expand_slice(
+            torch_ops.sgmv_expand_slice(
                inputs_tensor[index],
                lora_weights,
                out_tensor,
@@ -92,12 +88,13 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
 _dict_lock = Lock()


-def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
-                      hidden_size: int, nslices: int, dtype: torch.dtype,
-                      device: str, seq_length: int, scaling: float):
+def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
+                             hidden_size: int, nslices: int,
+                             dtype: torch.dtype, device: str, seq_length: int,
+                             scaling: float):
    """
-    Compare outputs of vllm.sgmv_shrink kernel against a reference
-    implementation.
+    Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
+    kernels.
    """
    data: PunicaTensors = generate_data_for_nslices(
        batches,
@@ -112,44 +109,52 @@ def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
    )
    max_seq_length, token_nums = data.meta()

+    # Setup metadata information for SGMV and reference kernels
+    sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
+                      data.prompt_lora_mapping, batches, max_seq_length,
+                      token_nums)
+
+    # Setup metadata information for the LoRA kernel.
+    lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
+                                    max_num_tokens=token_nums,
+                                    device='cuda')
+    lora_meta.prepare_tensors(data.token_lora_mapping)
+
+    ref_out_tensor = data.ref_out_tensor
+    out_tensor = data.our_out_tensor.clone()
+
    # Preventing cache error pointer.
    with _dict_lock:
+        # lora_shrink kernel
        _LORA_A_PTR_DICT.clear()
-        torch.ops.vllm.sgmv_shrink(
+        triton_ops.lora_shrink(
            data.inputs_tensor,
            data.lora_weights,
-            data.our_out_tensor,
-            data.b_seq_start_loc,
-            data.seq_len_tensor,
-            data.prompt_lora_mapping,
-            batches,
-            max_seq_length,
-            token_nums,
+            out_tensor,
+            *lora_meta.meta_args(token_nums=token_nums),
            scaling,
        )

-        sgmv_shrink_for_nslices(
-            nslices,
-            data.inputs_tensor,
-            data.lora_weights,
-            data.ref_out_tensor,
-            data.b_seq_start_loc,
-            data.seq_len_tensor,
-            data.prompt_lora_mapping,
-            batches,
-            max_seq_length,
-            token_nums,
-            scaling,
-        )
-    assert_close(data.our_out_tensor, data.ref_out_tensor)
+    # Reference
+    sgmv_shrink_for_nslices(
+        nslices,
+        data.inputs_tensor,
+        data.lora_weights,
+        ref_out_tensor,
+        *sgmv_meta_args,
+        scaling,
+    )

+    assert_close(out_tensor, ref_out_tensor)

-def check_sgmv_expand(batches: int, num_loras: int, rank: int,
-                      hidden_size: int, nslices: int, dtype: torch.dtype,
-                      device: str, seq_length: int, add_inputs: bool):
+
+def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
+                             hidden_size: int, nslices: int,
+                             dtype: torch.dtype, device: str, seq_length: int,
+                             add_inputs: bool):
    """
-    Compare outputs of vllm.sgmv_expand kernel against a reference
-    implementation.
+    Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
+    kernels.
    """
    data: PunicaTensors = generate_data_for_nslices(
        batches,
@@ -165,152 +170,41 @@ def check_sgmv_expand(batches: int, num_loras: int, rank: int,

    max_seq_length, token_nums = data.meta()

+    # Setup metadata information for SGMV and reference kernels
+    sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
+                      data.prompt_lora_mapping, batches, max_seq_length,
+                      token_nums)
+
+    # Setup metadata information for the LoRA kernel.
+    lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
+                                    max_num_tokens=token_nums,
+                                    device='cuda')
+    lora_meta.prepare_tensors(data.token_lora_mapping)
+
+    # Setup output tensors
+    ref_out_tensor = data.ref_out_tensor
+    out_tensor = data.our_out_tensor.clone()
+
    with _dict_lock:
+        # lora_expand kernel
        _LORA_B_PTR_DICT.clear()
-        torch.ops.vllm.sgmv_expand(
-            data.inputs_tensor,
-            data.lora_weights,
-            data.our_out_tensor,
-            data.b_seq_start_loc,
-            data.seq_len_tensor,
-            data.prompt_lora_mapping,
-            batches,
-            max_seq_length,
-            token_nums,
-            offset_start=0,
-            add_inputs=add_inputs,
-        )
-
+        triton_ops.lora_expand(data.inputs_tensor,
+                               data.lora_weights,
+                               out_tensor,
+                               *lora_meta.meta_args(token_nums=token_nums),
+                               offset_start=0,
+                               add_inputs=add_inputs)
+
+    # Reference
    sgmv_expand_for_nslices(nslices,
                            hidden_size,
                            data.inputs_tensor,
                            data.lora_weights,
-                            data.ref_out_tensor,
-                            data.b_seq_start_loc,
-                            data.seq_len_tensor,
-                            data.prompt_lora_mapping,
-                            batches,
-                            max_seq_length,
-                            token_nums,
+                            ref_out_tensor,
+                            *sgmv_meta_args,
                            add_inputs=add_inputs)

-    assert_close(data.our_out_tensor, data.ref_out_tensor)
-
-
-def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
-                      hidden_size: int, dtype: torch.dtype, device: str,
-                      scaling: float):
-    """
-    Compare vllm.bgmv_shrink against a reference implementation.
-    """
-    seq_length = 1
-    data: PunicaTensors = generate_data(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        "shrink",
-        device,
-    )
-
-    torch.ops.vllm.bgmv_shrink(
-        data.inputs_tensor,
-        data.lora_weights,
-        data.our_out_tensor,
-        data.token_lora_mapping,
-        scaling,
-    )
-
-    bgmv_shrink(
-        data.inputs_tensor,
-        data.lora_weights,
-        data.ref_out_tensor,
-        data.token_lora_mapping,
-        scaling,
-    )
-
-    data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
-    assert_close(data.our_out_tensor, data.ref_out_tensor)
-
-
-def check_bgmv_expand(batches: int, num_loras: int, rank: int,
-                      hidden_size: int, dtype: torch.dtype, device: str,
-                      add_inputs: bool):
-    """
-    Compare vllm.bgmv_expand against a reference implementation.
-    """
-    seq_length = 1
-    data: PunicaTensors = generate_data(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        "expand",
-        device,
-    )
-
-    torch.ops.vllm.bgmv_expand(
-        data.inputs_tensor,
-        data.lora_weights,
-        data.our_out_tensor,
-        data.token_lora_mapping,
-        add_inputs=add_inputs,
-    )
-    bgmv_expand(
-        data.inputs_tensor,
-        data.lora_weights,
-        data.ref_out_tensor,
-        data.token_lora_mapping,
-        add_inputs=add_inputs,
-    )
-    assert_close(data.our_out_tensor, data.ref_out_tensor)
-
-
-def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
-                            hidden_size: int, nslices: int, dtype: torch.dtype,
-                            device: str, add_inputs: bool):
-    """
-    Compare vllm.bgmv_expand_slice against a reference implementation.
-    """
-    seq_length = 1
-    data: PunicaTensors = generate_data_for_expand_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        nslices,
-        device,
-    )
-
-    slice_offset = 0
-    for index in range(nslices):
-        torch.ops.vllm.bgmv_expand_slice(
-            data.inputs_tensor,
-            data.lora_weights[index],
-            data.our_out_tensor,
-            data.token_lora_mapping,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=add_inputs,
-        )
-        bgmv_expand_slice(
-            data.inputs_tensor,
-            data.lora_weights[index],
-            data.ref_out_tensor,
-            data.token_lora_mapping,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=add_inputs,
-        )
-
-        slice_offset += hidden_size
-    assert_close(data.our_out_tensor, data.ref_out_tensor)
+    assert_close(out_tensor, ref_out_tensor)


 # Tests
@@ -440,7 +334,7 @@ SEED = [0]
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("seed", SEED)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
-def test_punica_sgmv(
+def test_kernels(
    batches: int,
    num_loras: int,
    rank: int,
@@ -451,29 +345,32 @@ def test_punica_sgmv(
    seed: int,
    op_type: str,
 ):
+    """
+    Tests LoRA kernels.
+    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    if op_type == "shrink":
-        check_sgmv_shrink(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          nslices=nslices,
-                          dtype=dtype,
-                          device=device,
-                          seq_length=128,
-                          scaling=0.5)
+        check_lora_shrink_kernel(batches=batches,
+                                 num_loras=num_loras,
+                                 rank=rank,
+                                 hidden_size=hidden_size,
+                                 nslices=nslices,
+                                 dtype=dtype,
+                                 device=device,
+                                 seq_length=128,
+                                 scaling=0.5)
    else:
-        check_sgmv_expand(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          nslices=nslices,
-                          dtype=dtype,
-                          device=device,
-                          seq_length=128,
-                          add_inputs=True)
+        check_lora_expand_kernel(batches=batches,
+                                 num_loras=num_loras,
+                                 rank=rank,
+                                 hidden_size=hidden_size,
+                                 nslices=nslices,
+                                 dtype=dtype,
+                                 device=device,
+                                 seq_length=128,
+                                 add_inputs=True)


 @pytest.mark.parametrize("batches", hs_test_params['batches'])
@@ -485,7 +382,7 @@ def test_punica_sgmv(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("seed", SEED)
 @pytest.mark.parametrize("op_type", ["shrink", "expand"])
-def test_punica_sgmv_hidden_size(
+def test_kernels_hidden_size(
    batches: int,
    num_loras: int,
    rank: int,
@@ -496,157 +393,29 @@ def test_punica_sgmv_hidden_size(
    seed: int,
    op_type: str,
 ):
+    """
+    Tests SGMV and LoRA kernels.
+    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    if op_type == "shrink":
-        check_sgmv_shrink(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          nslices=nslices,
-                          dtype=dtype,
-                          device=device,
-                          seq_length=128,
-                          scaling=0.5)
-    else:
-        check_sgmv_expand(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          nslices=nslices,
-                          dtype=dtype,
-                          device=device,
-                          seq_length=128,
-                          add_inputs=True)
-
-
-@pytest.mark.parametrize("batches", test_params['batches'])
-@pytest.mark.parametrize("num_loras", test_params['num_loras'])
-@pytest.mark.parametrize("rank", test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("device", DEVICES)
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-def test_punica_bgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    device: str,
-    seed: int,
-    op_type: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    if op_type == "shrink":
-        check_bgmv_shrink(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          dtype=dtype,
-                          device=device,
-                          scaling=0.5)
-    else:
-        check_bgmv_expand(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          dtype=dtype,
-                          device=device,
-                          add_inputs=True)
-
-
-@pytest.mark.parametrize("batches", hs_test_params['batches'])
-@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
-@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("device", DEVICES)
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-def test_punica_bgmv_hidden_size(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    device: str,
-    seed: int,
-    op_type: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    if op_type == "shrink":
-        check_bgmv_shrink(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          dtype=dtype,
-                          device=device,
-                          scaling=0.5)
+        check_lora_shrink_kernel(batches=batches,
+                                 num_loras=num_loras,
+                                 rank=rank,
+                                 hidden_size=hidden_size,
+                                 nslices=nslices,
+                                 dtype=dtype,
+                                 device=device,
+                                 seq_length=128,
+                                 scaling=0.5)
    else:
-        check_bgmv_expand(batches=batches,
-                          num_loras=num_loras,
-                          rank=rank,
-                          hidden_size=hidden_size,
-                          dtype=dtype,
-                          device=device,
-                          add_inputs=True)
-
-
-@pytest.mark.parametrize("batches", test_params['batches'])
-@pytest.mark.parametrize("num_loras", test_params['num_loras'])
-@pytest.mark.parametrize("rank", test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
-@pytest.mark.parametrize("nslices", [2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("device", DEVICES)
-@pytest.mark.parametrize("seed", SEED)
-def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
-                                    hidden_size: int, nslices: int,
-                                    dtype: torch.dtype, device: str,
-                                    seed: int):
-
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    check_bgmv_expand_slice(batches=batches,
-                            num_loras=num_loras,
-                            rank=rank,
-                            hidden_size=hidden_size,
-                            nslices=nslices,
-                            dtype=dtype,
-                            device=device,
-                            add_inputs=True)
-
-
-@pytest.mark.parametrize("batches", hs_test_params['batches'])
-@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
-@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
-@pytest.mark.parametrize("nslices", [2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("device", DEVICES)
-@pytest.mark.parametrize("seed", SEED)
-def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
-                                                rank: int, hidden_size: int,
-                                                nslices: int,
-                                                dtype: torch.dtype,
-                                                device: str, seed: int):
-
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    check_bgmv_expand_slice(batches=batches,
-                            num_loras=num_loras,
-                            rank=rank,
-                            hidden_size=hidden_size,
-                            nslices=nslices,
-                            dtype=dtype,
-                            device=device,
-                            add_inputs=True)
+        check_lora_expand_kernel(batches=batches,
+                                 num_loras=num_loras,
+                                 rank=rank,
+                                 hidden_size=hidden_size,
+                                 nslices=nslices,
+                                 dtype=dtype,
+                                 device=device,
+                                 seq_length=128,
+                                 add_inputs=True)
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -3,7 +3,6 @@
 # Adapted from
 # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
 from dataclasses import dataclass
-from typing import List

 import pytest
 import os
@@ -22,7 +21,7 @@ class ModelWithQuantization:
    quantization: str


-MODELS: List[ModelWithQuantization]
+MODELS: list[ModelWithQuantization]
 #AWQ quantization is currently not supported in ROCm.
 if current_platform.is_rocm():
    MODELS = [
@@ -44,7 +43,7 @@ else:
 def do_sample(llm: vllm.LLM,
              lora_path: str,
              lora_id: int,
-              max_tokens: int = 256) -> List[str]:
+              max_tokens: int = 256) -> list[str]:
    raw_prompts = [
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
@@ -64,7 +63,7 @@ def do_sample(llm: vllm.LLM,
        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
        if lora_id else None)
    # Print the outputs.
-    generated_texts: List[str] = []
+    generated_texts: list[str] = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
@@ -182,7 +181,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                                 model):
    if num_gpus_available < 2:
        pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
-
+    if model.quantization == "GPTQ":
+        pytest.skip("GPTQ lora outputs are just incredibly unstable")
    llm_tp1 = vllm.LLM(
        model=model.model_path,
        enable_lora=True,

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
 # SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Optional

 import pytest
 from packaging.version import Version
@@ -12,6 +12,14 @@ from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform


+@pytest.fixture(autouse=not current_platform.is_cpu())
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @dataclass
 class TestConfig:
    model_path: str
@@ -20,7 +28,7 @@ class TestConfig:
    max_loras: int = 2
    max_lora_rank: int = 16
    max_model_len: int = 4096
-    mm_processor_kwargs: Optional[Dict[str, int]] = None
+    mm_processor_kwargs: Optional[dict[str, int]] = None

    def __post_init__(self):
        if self.mm_processor_kwargs is None:
@@ -57,11 +65,11 @@ class Qwen2VLTester:
        )

    def run_test(self,
-                 images: List[ImageAsset],
-                 expected_outputs: List[str],
+                 images: list[ImageAsset],
+                 expected_outputs: list[str],
                 lora_id: Optional[int] = None,
                 temperature: float = 0,
-                 max_tokens: int = 5) -> List[str]:
+                 max_tokens: int = 5) -> list[str]:

        sampling_params = vllm.SamplingParams(
            temperature=temperature,

--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -25,12 +25,10 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
    )
    lora_request = LoRARequest("1", 1, sql_lora_files)
    assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
-        request_id="request_id", prompt="prompt", lora_request=lora_request)
+        prompt="prompt", lora_request=lora_request)
    assert reference_tokenizer.encode(
        "prompt") == await tokenizer_group.encode_async(
-            request_id="request_id",
-            prompt="prompt",
-            lora_request=lora_request)
+            prompt="prompt", lora_request=lora_request)
    assert isinstance(tokenizer_group.get_lora_tokenizer(None),
                      PreTrainedTokenizerBase)
    assert tokenizer_group.get_lora_tokenizer(

--- a/tests/lora/test_transfomers_model.py
+++ b/tests/lora/test_transfomers_model.py
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+import vllm
+from vllm.lora.request import LoRARequest
+
+from ..utils import create_new_process_for_each_test, multi_gpu_test
+
+MODEL_PATH = "ArthurZ/ilama-3.2-1B"
+
+PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
+
+EXPECTED_LORA_OUTPUT = [
+    "SELECT count(*) FROM singer",
+    "SELECT avg(age) ,  min(age) ,  max(age) FROM singer WHERE country  =  'France'",  # noqa: E501
+    "SELECT DISTINCT Country FROM singer WHERE Age  >  20",
+]
+
+
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
+    prompts = [
+        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
+        PROMPT_TEMPLATE.format(
+            query=
+            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+        ),
+        PROMPT_TEMPLATE.format(
+            query=
+            "What are all distinct countries where singers above age 20 are from?"  # noqa: E501
+        ),
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts: list[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+@pytest.mark.skip_v1
+@create_new_process_for_each_test()
+def test_ilama_lora(ilama_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4,
+                   max_lora_rank=16,
+                   tensor_parallel_size=1,
+                   trust_remote_code=True,
+                   enable_chunked_prefill=True)
+
+    output1 = do_sample(llm, ilama_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
+    output2 = do_sample(llm, ilama_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output2[i] == EXPECTED_LORA_OUTPUT[i]
+
+
+@pytest.mark.skip_v1
+@multi_gpu_test(num_gpus=4)
+@create_new_process_for_each_test()
+def test_ilama_lora_tp4(ilama_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4,
+                   max_lora_rank=16,
+                   tensor_parallel_size=4,
+                   trust_remote_code=True,
+                   fully_sharded_loras=False,
+                   enable_chunked_prefill=True)
+
+    output1 = do_sample(llm, ilama_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
+    output2 = do_sample(llm, ilama_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output2[i] == EXPECTED_LORA_OUTPUT[i]
+
+
+@pytest.mark.skip_v1
+@multi_gpu_test(num_gpus=4)
+@create_new_process_for_each_test()
+def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4,
+                   max_lora_rank=16,
+                   tensor_parallel_size=4,
+                   trust_remote_code=True,
+                   fully_sharded_loras=True,
+                   enable_chunked_prefill=True)
+    output1 = do_sample(llm, ilama_lora_files, lora_id=1)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
+    output2 = do_sample(llm, ilama_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_LORA_OUTPUT)):
+        assert output2[i] == EXPECTED_LORA_OUTPUT[i]
--- a/tests/lora/test_ultravox.py
+++ b/tests/lora/test_ultravox.py
-# SPDX-License-Identifier: Apache-2.0
-
-import shutil
-from os import path
-from tempfile import TemporaryDirectory
-from typing import List, Tuple
-
-import torch
-from huggingface_hub import snapshot_download
-from safetensors.torch import load_file, save_file
-from transformers import AutoTokenizer
-
-from vllm.lora.request import LoRARequest
-
-from ..models.utils import check_outputs_equal
-
-ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
-LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
-
-VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
-
-PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
-
-
-def llama3_1_8b_chess_lora_path():
-    return snapshot_download(
-        repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
-
-
-# can't use llama lora adapter without module name transformation
-# because ultravox nest language model
-def transform_module_names_for_ultravox(state_dict):
-    transformed_state_dict = {}
-    for key, value in state_dict.items():
-        new_key = key.replace("base_model.model",
-                              "base_model.model.language_model")
-        transformed_state_dict[new_key] = value
-    return transformed_state_dict
-
-
-def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
-    tensor_file = "adapter_model.safetensors"
-    state_dict = load_file(path.join(source_repo, tensor_file))
-    transformed_state_dict = transform_module_names_for_ultravox(state_dict)
-
-    save_file(transformed_state_dict, path.join(target_path, tensor_file))
-
-    config_file = "adapter_config.json"
-    shutil.copyfile(path.join(source_repo, config_file),
-                    path.join(target_path, config_file))
-    return target_path
-
-
-def _get_prompt(audio_count, question, placeholder, model_name) -> str:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    placeholder = f"{placeholder}\n" * audio_count
-
-    return tokenizer.apply_chat_template([{
-        'role': 'user',
-        'content': f"{placeholder}{question}"
-    }],
-                                         tokenize=False,
-                                         add_generation_prompt=True)
-
-
-def test_ultravox_lora(vllm_runner):
-    """
-    TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
-    """
-    # Workaround to prevent device mismatch in Whisper.
-    # Can be removed when it is fixed upstream in transformer
-    # https://github.com/huggingface/transformers/pull/35866
-    torch.set_default_device("cpu")
-
-    llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
-    with TemporaryDirectory() as temp_ultravox_lora_dir:
-        llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
-            llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
-        with vllm_runner(
-                ULTRAVOX_MODEL_NAME,
-                enforce_eager=True,
-                max_num_seqs=2,
-                enable_lora=True,
-                max_loras=1,
-                max_lora_rank=128,
-                dtype="bfloat16",
-                max_model_len=1024,
-        ) as vllm_model:
-            ultravox_outputs: List[Tuple[
-                List[int], str]] = vllm_model.generate_greedy(
-                    [
-                        _get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
-                                    ULTRAVOX_MODEL_NAME)
-                    ],
-                    256,
-                    lora_request=LoRARequest(str(1), 1,
-                                             llama3_1_8b_ultravox_chess_lora),
-                )
-
-    # run llama with and without lora to compare outputs with above
-    with vllm_runner(
-            LLMA_MODEL_NAME,
-            enforce_eager=True,
-            max_num_seqs=2,
-            enable_lora=True,
-            max_loras=1,
-            max_lora_rank=128,
-            dtype="bfloat16",
-            max_model_len=1024,
-    ) as vllm_model:
-        llama_outputs: List[Tuple[List[int], str]] = (
-            vllm_model.generate_greedy(
-                [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
-                256,
-                lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
-            ))
-
-    check_outputs_equal(
-        outputs_0_lst=ultravox_outputs,
-        outputs_1_lst=llama_outputs,
-        name_0="ultravox",
-        name_1="llama",
-    )
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -3,19 +3,46 @@
 import os
 import random
 import tempfile
+from typing import Union
 from unittest.mock import patch

+import pytest
+
+import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ParallelConfig, SchedulerConfig,
                         VllmConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
+from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 from ..utils import models_path_prefix


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
+
+    def set_active_loras(worker: Union[Worker, V1Worker],
+                         lora_requests: list[LoRARequest]):
+        lora_mapping = LoRAMapping([], [])
+        if isinstance(worker, Worker):
+            # v0 case
+            worker.model_runner.set_active_loras(lora_requests, lora_mapping)
+        else:
+            # v1 case
+            worker.model_runner.lora_manager.set_active_adapters(
+                lora_requests, lora_mapping)
+
+    worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker
+
    vllm_config = VllmConfig(
        model_config=ModelConfig(
            os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
@@ -26,6 +53,7 @@ def test_worker_apply_lora(sql_lora_files):
            seed=0,
            dtype="float16",
            revision=None,
+            enforce_eager=True,
        ),
        load_config=LoadConfig(
            download_dir=None,
@@ -41,16 +69,17 @@ def test_worker_apply_lora(sql_lora_files):
        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
                               max_loras=32),
    )
-    worker = Worker(
+    worker = worker_cls(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
    )
+
    worker.init_device()
    worker.load_model()

-    worker.model_runner.set_active_loras([], LoRAMapping([], []))
+    set_active_loras(worker, [])
    assert worker.list_loras() == set()

    n_loras = 32
@@ -58,7 +87,7 @@ def test_worker_apply_lora(sql_lora_files):
        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
    ]

-    worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
+    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
        lora_request.lora_int_id
        for lora_request in lora_requests
@@ -70,8 +99,7 @@ def test_worker_apply_lora(sql_lora_files):
                                            k=random.randint(1, n_loras))
        random.shuffle(iter_lora_requests)
        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
-        worker.model_runner.set_active_loras(iter_lora_requests,
-                                             LoRAMapping([], []))
+        set_active_loras(worker, lora_requests)
        assert worker.list_loras().issuperset(
            {lora_request.lora_int_id
             for lora_request in iter_lora_requests})
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
 # SPDX-License-Identifier: Apache-2.0

 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Optional, Union

 import torch

@@ -12,7 +12,7 @@ class DummyLoRAManager:

    def __init__(self, device: torch.device = "cuda:0"):
        super().__init__()
-        self._loras: Dict[str, LoRALayerWeights] = {}
+        self._loras: dict[str, LoRALayerWeights] = {}
        self._device = device

    def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
@@ -77,11 +77,11 @@ class DummyLoRAManager:
        self,
        module_name: str,
        input_dim: int,
-        output_dims: List[int],
-        noop_lora_index: Optional[List[int]] = None,
+        output_dims: list[int],
+        noop_lora_index: Optional[list[int]] = None,
        rank: int = 8,
    ):
-        base_loras: List[LoRALayerWeights] = []
+        base_loras: list[LoRALayerWeights] = []
        noop_lora_index_set = set(noop_lora_index or [])

        for i, out_dim in enumerate(output_dims):
@@ -110,7 +110,7 @@ def assert_close(a, b):
 @dataclass
 class PunicaTensors:
    inputs_tensor: torch.Tensor
-    lora_weights: Union[torch.Tensor, List[torch.Tensor]]
+    lora_weights: Union[torch.Tensor, list[torch.Tensor]]
    our_out_tensor: torch.Tensor
    ref_out_tensor: torch.Tensor
    b_seq_start_loc: torch.Tensor
@@ -118,7 +118,7 @@ class PunicaTensors:
    seq_len_tensor: torch.Tensor
    token_lora_mapping: torch.Tensor

-    def meta(self) -> Tuple[int, int]:
+    def meta(self) -> tuple[int, int]:
        """
        Infer max_seq_length and token_nums from the tensors
        and return them.

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -2,14 +2,13 @@

 import os
 import time
-from typing import List

 import pytest
 import ray
 from prometheus_client import REGISTRY

+import vllm.envs as envs
 from vllm import EngineArgs, LLMEngine
-from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -18,15 +17,21 @@ from vllm.sampling_params import SamplingParams
 import vllm.envs as envs

 from ..utils import models_path_prefix
+from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
+
+
+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch):
+    """
+    This module tests V0 internals, so set VLLM_USE_V1=0.
+    """
+    monkeypatch.setenv('VLLM_USE_V1', '0')

-from ..conftest import MODEL_WEIGHTS_S3_BUCKET

 MODELS = [
    os.path.join(models_path_prefix, "distilbert/distilgpt2"),
 ]

-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
-

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@@ -140,7 +145,7 @@ def test_metric_counter_generation_tokens_multi_step(
    "served_model_name",
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
 def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
-                                   served_model_name: List[str]) -> None:
+                                   served_model_name: list[str]) -> None:
    with vllm_runner(model,
                     dtype=dtype,
                     disable_log_stats=False,
@@ -149,10 +154,11 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
        metrics_tag_content = stat_logger.labels["model_name"]

+    if envs.VLLM_CI_USE_S3:
+        model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
    if served_model_name is None or served_model_name == []:
-        actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}"
-        assert metrics_tag_content == actual_model_name, (
-            f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
+        assert metrics_tag_content == model, (
+            f"Metrics tag model_name is wrong! expect: {model!r}\n"
            f"actual: {metrics_tag_content!r}")
    else:
        assert metrics_tag_content == served_model_name[0], (
@@ -178,10 +184,11 @@ async def test_async_engine_log_metrics_regression(
    when disable_log_stats=False
    (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
    """
-    engine_args = AsyncEngineArgs(model=model,
-                                  dtype=dtype,
-                                  disable_log_stats=disable_log_stats,
-                                  load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    engine_args = AsyncEngineArgs(
+        model=model,
+        dtype=dtype,
+        disable_log_stats=disable_log_stats,
+    )
    async_engine = AsyncLLMEngine.from_engine_args(engine_args)
    for i, prompt in enumerate(example_prompts):
        results = async_engine.generate(
@@ -193,7 +200,7 @@ async def test_async_engine_log_metrics_regression(
        async for _ in results:
            pass

-    assert_metrics(async_engine.engine, disable_log_stats,
+    assert_metrics(model, async_engine.engine, disable_log_stats,
                   len(example_prompts))


@@ -208,10 +215,11 @@ def test_engine_log_metrics_regression(
    max_tokens: int,
    disable_log_stats: bool,
 ) -> None:
-    engine_args = EngineArgs(model=model,
-                             dtype=dtype,
-                             disable_log_stats=disable_log_stats,
-                             load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    engine_args = EngineArgs(
+        model=model,
+        dtype=dtype,
+        disable_log_stats=disable_log_stats,
+    )
    engine = LLMEngine.from_engine_args(engine_args)
    for i, prompt in enumerate(example_prompts):
        engine.add_request(
@@ -222,7 +230,9 @@ def test_engine_log_metrics_regression(
    while engine.has_unfinished_requests():
        engine.step()

-    assert_metrics(engine, disable_log_stats, len(example_prompts))
+    if envs.VLLM_CI_USE_S3:
+        model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
+    assert_metrics(model, engine, disable_log_stats, len(example_prompts))


 @pytest.mark.parametrize("model", MODELS)
@@ -289,14 +299,15 @@ def test_metric_spec_decode_interval(
 ) -> None:
    k = 5

-    engine_args = EngineArgs(model=model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.4,
-                             speculative_model=model,
-                             num_speculative_tokens=k,
-                             enforce_eager=True,
-                             load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    engine_args = EngineArgs(
+        model=model,
+        dtype=dtype,
+        disable_log_stats=False,
+        gpu_memory_utilization=0.4,
+        speculative_model=model,
+        num_speculative_tokens=k,
+        enforce_eager=True,
+    )

    engine = LLMEngine.from_engine_args(engine_args)

@@ -363,7 +374,7 @@ def test_metric_spec_decode_interval(
        cleanup_dist_env_and_memory()


-def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
+def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
                   num_requests: int) -> None:
    if disable_log_stats:
        with pytest.raises(AttributeError):
@@ -374,7 +385,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
        # Ensure the count bucket of request-level histogram metrics matches
        # the number of requests as a simple sanity check to ensure metrics are
        # generated
-        labels = {'model_name': engine.model_config.model}
+        labels = {'model_name': model}
        request_histogram_metrics = [
            "vllm:e2e_request_latency_seconds",
            "vllm:request_prompt_tokens",

--- a/tests/mistral_tool_use/utils.py
+++ b/tests/mistral_tool_use/utils.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import Dict, List, Optional
+from typing import Optional

 from typing_extensions import TypedDict


 class ServerConfig(TypedDict, total=False):
    model: str
-    arguments: List[str]
+    arguments: list[str]
    system_prompt: Optional[str]
    supports_parallel: Optional[bool]
    supports_rocm: Optional[bool]


-ARGS: List[str] = ["--max-model-len", "1024"]
+ARGS: list[str] = ["--max-model-len", "1024"]

-CONFIGS: Dict[str, ServerConfig] = {
+CONFIGS: dict[str, ServerConfig] = {
    "mistral": {
        "model":
        "mistralai/Mistral-7B-Instruct-v0.3",

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
 # SPDX-License-Identifier: Apache-2.0

-from typing import List
-
 import pytest

 from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
@@ -9,7 +7,10 @@ from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import (GeluAndMul,
                                                   ReLUSquaredActivation,
                                                   SiluAndMul)
-from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.layernorm import (
+    RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
+    rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
+from vllm.platforms import current_platform


 # Registered subclass for test
@@ -51,7 +52,7 @@ class Relu3(ReLUSquaredActivation):
        # All but RMSNorm
        ("all,-rms_norm", 4, [0, 1, 1, 1], True),
    ])
-def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
+def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int],
                     default_on: bool):
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=torch_level, custom_ops=env.split(",")))
@@ -89,3 +90,27 @@ def test_enabled_ops_invalid(env: str):
            custom_ops=env.split(",")))
        with set_current_vllm_config(vllm_config):
            RMSNorm(1024).enabled()
+
+
+@pytest.mark.parametrize("add_residual", [True, False])
+@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
+@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
+@pytest.mark.skipif(not current_platform.is_rocm(),
+                    reason="AITER is a feature exclusive for ROCm")
+def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
+                           use_rocm_aiter_norm: str, monkeypatch):
+    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
+    monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
+    rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)
+
+    if not add_residual:
+        if current_platform.is_rocm() and int(use_rocm_aiter) and int(
+                use_rocm_aiter_norm):
+            assert rms_norm_func == rocm_aiter_rms_norm
+        else:
+            assert rms_norm_func == rms_norm
+    elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
+            use_rocm_aiter_norm):
+        assert rms_norm_func == rocm_aiter_fused_add_rms_norm
+    else:
+        assert rms_norm_func == fused_add_rms_norm
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
 # SPDX-License-Identifier: Apache-2.0

+import json
 import pickle

 import pytest
@@ -17,19 +18,38 @@ from vllm.model_executor.guided_decoding.outlines_logits_processors import (
 from vllm.sampling_params import GuidedDecodingParams
 from ..utils import models_path_prefix

+
 MODEL_NAME = os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta')
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+GUIDED_DECODING_BACKENDS = [
+    "outlines", "lm-format-enforcer", "xgrammar", "guidance"
+]
+GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"]
+REASONING_MODEL_NAME = os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
+
+
+# Initialize the tokenizer for the model here to avoid repeated loading
+@pytest.fixture(scope="module")
+def zephyr_7B_tokenzer():
+    return AutoTokenizer.from_pretrained(MODEL_NAME)


-def test_guided_logits_processors(sample_regex, sample_json_schema):
+@pytest.fixture(scope="module")
+def deepseek_r1_qwen_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex,
+                                  sample_json_schema):
    """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
-    tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta'))
-    regex_LP = RegexLogitsProcessor(sample_regex, tokenizer)
+    regex_LP = RegexLogitsProcessor(sample_regex,
+                                    zephyr_7B_tokenzer,
+                                    reasoner=None)
    json_LP = JSONLogitsProcessor(sample_json_schema,
-                                  tokenizer,
-                                  whitespace_pattern=None)
+                                  zephyr_7B_tokenzer,
+                                  whitespace_pattern=None,
+                                  reasoner=None)

-    token_ids = tokenizer.encode(
+    token_ids = zephyr_7B_tokenzer.encode(
        f"Give an example IPv4 address with this regex: {sample_regex}")
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
@@ -37,7 +57,7 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

-    token_ids = tokenizer.encode(
+    token_ids = zephyr_7B_tokenzer.encode(
        f"Give an employee profile that fits this schema: {sample_json_schema}"
    )
    tensor = torch.rand(32000)
@@ -52,7 +72,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
 @pytest.mark.parametrize("is_local", [True, False])
 async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
                                                 sample_regex,
-                                                 sample_json_schema):
+                                                 sample_json_schema,
+                                                 zephyr_7B_tokenzer):

    config = ModelConfig(
        MODEL_NAME,
@@ -63,15 +84,14 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
        seed=0,
        dtype="bfloat16",
    )
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    token_ids = tokenizer.encode(
+    token_ids = zephyr_7B_tokenzer.encode(
        f"Give an example IPv4 address with this regex: {sample_regex}")
    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)

    regex_lp = get_local_guided_decoding_logits_processor(
-            regex_request, tokenizer, config) if is_local else \
+            regex_request, zephyr_7B_tokenzer, config) if is_local else \
            await get_guided_decoding_logits_processor(
-                    regex_request, tokenizer, config)
+                    regex_request, zephyr_7B_tokenzer, config)
    assert regex_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
@@ -79,13 +99,85 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)

-    token_ids = tokenizer.encode(
+    token_ids = zephyr_7B_tokenzer.encode(
        f"Give an employee profile that fits this schema: {sample_json_schema}"
    )
    json_request = GuidedDecodingParams(json=sample_json_schema,
                                        backend=backend)
    json_lp = await get_guided_decoding_logits_processor(
-        json_request, tokenizer, config)
+        json_request, zephyr_7B_tokenzer, config)
+    assert json_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = json_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert not torch.allclose(tensor, original_tensor)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("backend",
+                         GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT)
+@pytest.mark.parametrize("is_local", [True, False])
+@pytest.mark.parametrize("reasoning_backend", ["deepseek_r1"])
+async def test_guided_logits_processor_with_reasoning(
+        backend: str, is_local: bool, reasoning_backend: str, sample_regex,
+        sample_json_schema, deepseek_r1_qwen_tokenizer):
+
+    config = ModelConfig(
+        REASONING_MODEL_NAME,
+        task="generate",
+        tokenizer=REASONING_MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="bfloat16",
+    )
+    token_ids = deepseek_r1_qwen_tokenizer.encode(
+        f"Give an example IPv4 address with this regex: {sample_regex}."
+        "<think>here is the thinking process")
+    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
+
+    regex_lp = get_local_guided_decoding_logits_processor(regex_request,
+                    deepseek_r1_qwen_tokenizer, config,
+                    reasoning_backend) if is_local else \
+            await get_guided_decoding_logits_processor(
+                    regex_request, deepseek_r1_qwen_tokenizer, config,
+                    reasoning_backend)
+    assert regex_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = regex_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert torch.allclose(tensor, original_tensor)
+
+    token_ids = deepseek_r1_qwen_tokenizer.encode(
+        f"Give an employee profile that fits this schema: {sample_json_schema}."
+        "<think>here is the thinking process")
+    json_request = GuidedDecodingParams(json=sample_json_schema,
+                                        backend=backend)
+    json_lp = get_local_guided_decoding_logits_processor(
+        json_request, deepseek_r1_qwen_tokenizer, config,
+        reasoning_backend) if is_local else \
+        await get_guided_decoding_logits_processor(
+            json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
+    assert json_lp is not None
+    tensor = torch.rand(32000)
+    original_tensor = torch.clone(tensor)
+    tensor = json_lp(token_ids, tensor)
+    assert tensor.shape == original_tensor.shape
+    assert torch.allclose(tensor, original_tensor)
+
+    # Thinking is over, so the tensor should change.
+    token_ids = deepseek_r1_qwen_tokenizer.encode(
+        f"Give an employee profile that fits this schema: {sample_json_schema}."
+        "<think>here is the thinking process</think> Then")
+    json_request = GuidedDecodingParams(json=sample_json_schema,
+                                        backend=backend)
+    json_lp = get_local_guided_decoding_logits_processor(
+        json_request, deepseek_r1_qwen_tokenizer, config,
+        reasoning_backend) if is_local else \
+        await get_guided_decoding_logits_processor(
+            json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
    assert json_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
@@ -112,9 +204,17 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
        GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")


-def test_pickle_xgrammar_tokenizer_data():
+def test_guided_decoding_backend_options():
+    """Test backend-specific options"""
+    params = GuidedDecodingParams(
+        backend="xgrammar:option-1,option-2,option-3")
+    assert params.backend_options() == ["option-1", "option-2", "option-3"]
+
+    no_fallback = GuidedDecodingParams(backend="xgrammar:option-1,no-fallback")
+    assert no_fallback.no_fallback()
+

-    # TODO: move to another test file for xgrammar
+def test_pickle_xgrammar_tokenizer_data():
    try:
        import xgrammar as xgr
    except ImportError:
@@ -122,7 +222,11 @@ def test_pickle_xgrammar_tokenizer_data():

    from vllm.model_executor.guided_decoding.xgrammar_decoding import (
        TokenizerData)
-    tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
+    tokenizer_data = TokenizerData(
+        metadata=
+        '{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
+        encoded_vocab=['!', '"', '#', '$', '%'],
+    )
    pickled = pickle.dumps(tokenizer_data)

    assert pickled is not None
@@ -130,4 +234,5 @@ def test_pickle_xgrammar_tokenizer_data():
    depickled: TokenizerData = pickle.loads(pickled)

    assert depickled is not None
-    assert depickled.vocab_type == xgr.VocabType.RAW
+    assert json.loads(
+        depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value