Commit 51679bbd authored by zhuwenwen

resolve merge conflicts

parents 4095d0db 1af090b5
import random
import os
import pytest
import ray
import torch
import torch.distributed as dist
from vllm.model_executor.parallel_utils import custom_all_reduce as custom_ar
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
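# Round each size down to a multiple of 8 elements so the buffers stay 16-byte
# aligned for fp16/bf16 (alignment the custom all-reduce kernel presumably requires).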
for i, v in enumerate(test_sizes):
test_sizes[i] -= v % 8
@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
distributed_init_port)
custom_ar.init_custom_ar()
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with custom_ar.capture():
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
inp2 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2)
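# Replaying the graph re-runs both custom all-reduces; each output should match
# the NCCL in-place all-reduce of the same input buffer.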
graph.replay()
assert torch.allclose(out1, inp1)
assert torch.allclose(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
distributed_init_port)
sz = 1024
custom_ar.init_custom_ar()
fa = custom_ar.get_handle()
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = fa.all_reduce_unreg(inp)
assert torch.allclose(out, inp * world_size)
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = fa.all_reduce_unreg(inp)
assert torch.allclose(out, inp * world_size)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
multi_process_tensor_parallel(tensor_parallel_size, test_target)
if __name__ == "__main__":
multi_process_tensor_parallel(2, graph_allreduce)
import os
import subprocess
import time
import sys
import pytest
import requests
import ray # using Ray for overall ease of process management, parallel requests, and debugging.
import openai # use the official client for correctness check
MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here
pytestmark = pytest.mark.asyncio
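# Run every async test in this module under pytest-asyncio.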
@ray.remote(num_gpus=1)
class ServerRunner:
def __init__(self, args):
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
self.proc = subprocess.Popen(
["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
)
self._wait_for_server()
def ready(self):
return True
def _wait_for_server(self):
# Poll the health endpoint until the server responds; abort if the
# process exits or the timeout elapses.
start = time.time()
while True:
try:
if requests.get(
"http://localhost:8000/health").status_code == 200:
break
except Exception as err:
if self.proc.poll() is not None:
raise RuntimeError("Server exited unexpectedly.") from err
time.sleep(0.5)
if time.time() - start > MAX_SERVER_START_WAIT_S:
raise RuntimeError(
"Server failed to start in time.") from err
def __del__(self):
if hasattr(self, "proc"):
self.proc.terminate()
@pytest.fixture(scope="session")
def server():
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16", # use half precision for speed and memory savings in CI environment
"--max-model-len",
"8192",
"--enforce-eager",
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
async def test_single_completion(server, client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
async def test_single_chat_session(server, client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
)
assert chat_completion.id is not None
assert chat_completion.choices is not None and len(
chat_completion.choices) == 1
assert chat_completion.choices[0].message is not None
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
async def test_completion_streaming(server, client: openai.AsyncOpenAI):
prompt = "What is an LLM?"
single_completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
single_usage = single_completion.usage
stream = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True,
)
chunks = []
async for chunk in stream:
chunks.append(chunk.choices[0].text)
assert chunk.choices[0].finish_reason == "length"
assert chunk.usage == single_usage
assert "".join(chunks) == single_output
async def test_chat_streaming(server, client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
)
chunks = []
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
assert chunk.choices[0].finish_reason == stop_reason
assert "".join(chunks) == output
async def test_batch_completions(server, client: openai.AsyncOpenAI):
# test simple list
batch = await client.completions.create(
model=MODEL_NAME,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
)
assert len(batch.choices) == 2
assert batch.choices[0].text == batch.choices[1].text
# test n = 2
batch = await client.completions.create(
model=MODEL_NAME,
prompt=["Hello, my name is", "Hello, my name is"],
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but is not necessary for the official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"
# test streaming
batch = await client.completions.create(
model=MODEL_NAME,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
assert texts[0] == texts[1]
if __name__ == "__main__":
pytest.main([__file__])
from typing import List, Tuple
import pytest
import torch
def create_kv_caches(
num_blocks: int,
block_size: int,
num_layers: int,
num_heads: int,
head_size: int,
dtype: torch.dtype,
seed: int,
device: str,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
scale = head_size**-0.5
x = 16 // torch.tensor([], dtype=dtype).element_size()
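# x is the number of elements that fit in 16 bytes; the key cache packs them
# along the last dimension of the [num_blocks, num_heads, head_size // x,
# block_size, x] layout used below.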
key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
key_caches = []
for _ in range(num_layers):
key_cache = torch.empty(size=key_cache_shape,
dtype=dtype,
device=device)
key_cache.uniform_(-scale, scale)
key_caches.append(key_cache)
value_cache_shape = (num_blocks, num_heads, head_size, block_size)
value_caches = []
for _ in range(num_layers):
value_cache = torch.empty(size=value_cache_shape,
dtype=dtype,
device=device)
value_cache.uniform_(-scale, scale)
value_caches.append(value_cache)
return key_caches, value_caches
from vllm.utils import create_kv_caches_with_random
@pytest.fixture()
def kv_cache_factory():
return create_kv_caches
return create_kv_caches_with_random
@@ -6,14 +6,16 @@ import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from vllm._C import ops
from vllm._C import ops, cache_ops
from vllm.utils import get_max_shared_memory_bytes
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 40000 # Arbitrary values for testing
# There may not be enough GPU memory due to the large NUM_BLOCKS.
# Reduce NUM_BLOCKS if that happens.
NUM_BLOCKS = 4321 # Arbitrary values for testing
PARTITION_SIZE = 512
DTYPES = [torch.half, torch.bfloat16, torch.float]
@@ -23,6 +25,7 @@ NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
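# "auto" keeps the KV cache in the model dtype; "fp8_e5m2" exercises the
# quantized FP8 KV-cache path introduced in this change.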
KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
SEEDS = [0]
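# Use only the first GPU when a single device is present; otherwise test the
# first two devices.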
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@@ -105,6 +108,7 @@ def ref_single_query_cached_kv_attention(
@pytest.mark.parametrize("use_alibi", USE_ALIBI)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
def test_paged_attention(
@@ -116,6 +120,7 @@ def test_paged_attention(
use_alibi: bool,
block_size: int,
dtype: torch.dtype,
kv_cache_dtype: str,
seed: int,
device: int,
) -> None:
@@ -158,8 +163,9 @@ def test_paged_attention(
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
num_kv_heads, head_size, dtype,
seed, gpu_id)
num_kv_heads, head_size,
kv_cache_dtype, dtype, seed,
gpu_id)
key_cache, value_cache = key_caches[0], value_caches[0]
# Call the paged attention kernel.
@@ -177,6 +183,7 @@ def test_paged_attention(
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
)
elif version == "v2":
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
@@ -209,11 +216,30 @@ def test_paged_attention(
block_size,
max_context_len,
alibi_slopes,
kv_cache_dtype,
)
else:
raise AssertionError(f"Unknown version: {version}")
# Run the reference implementation.
if kv_cache_dtype == "fp8_e5m2":
# Convert cache data back to dtype.
x = 16 // torch.tensor([], dtype=dtype).element_size()
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
block_size, x)
dequantized_key_cache = torch.empty(size=key_cache_shape,
dtype=dtype,
device=gpu_id)
cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
key_cache = dequantized_key_cache
value_cache_shape = value_cache.shape
dequantized_value_cache = torch.empty(size=value_cache_shape,
dtype=dtype,
device=gpu_id)
cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
value_cache = dequantized_value_cache
ref_output = torch.empty_like(query)
ref_single_query_cached_kv_attention(
ref_output,
@@ -230,7 +256,12 @@ def test_paged_attention(
# NOTE(woosuk): Due to the kernel-level differences in the two
# implementations, there is a small numerical difference in the two
# outputs. Thus, we use a relaxed tolerance for the test.
assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)
# NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
# so we use a relaxed tolerance for the test.
atol, rtol = 1e-3, 1e-5
if kv_cache_dtype == "fp8_e5m2":
atol, rtol = 1e-2, 1e-5
assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
def ref_multi_query_kv_attention(
@@ -3,18 +3,22 @@ import random
import pytest
import torch
from typing import Tuple
from vllm._C import cache_ops
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [83] # Arbitrary values for testing
NUM_TOKENS = [42] # Arbitrary values for testing
NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing
NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing
NUM_MAPPINGS = [256] # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@@ -26,6 +30,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
kv_cache_factory,
@@ -38,6 +43,7 @@ def test_copy_blocks(
dtype: torch.dtype,
seed: int,
device: int,
kv_cache_dtype: str,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
@@ -59,7 +65,8 @@ def test_copy_blocks(
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
num_layers, num_heads,
head_size, dtype, seed, gpu_id)
head_size, kv_cache_dtype,
dtype, seed, gpu_id)
# Clone the KV caches.
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
@@ -124,7 +131,7 @@ def test_reshape_and_cache(
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
num_heads, head_size, dtype,
seed, gpu_id)
None, seed, gpu_id)
key_cache, value_cache = key_caches[0], value_caches[0]
# Clone the KV caches.
@@ -133,7 +140,7 @@ def test_reshape_and_cache(
# Call the reshape_and_cache kernel.
cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
slot_mapping)
slot_mapping, "auto")
# Run the reference implementation.
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
@@ -149,3 +156,68 @@ def test_reshape_and_cache(
assert torch.allclose(key_cache, cloned_key_cache)
assert torch.allclose(value_cache, cloned_value_cache)
@pytest.mark.parametrize("direction", COPYING_DIRECTION)
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_swap_blocks(
kv_cache_factory,
direction: Tuple[str, str],
num_mappings: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
seed: int,
device: int,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
src_device = f"{direction[0]}:{device}" if direction[
0] == "cuda" else direction[0]
dst_device = f"{direction[1]}:{device}" if direction[
1] == "cuda" else direction[1]
src_blocks = random.sample(range(num_blocks), num_mappings)
# For the same device, mapping must not overlap
if src_device == dst_device:
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining_blocks, num_mappings)
else:
dst_blocks = random.sample(range(num_blocks), num_mappings)
block_mapping = dict(zip(src_blocks, dst_blocks))
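# Map each selected source block to a distinct destination block; the
# assertions below verify the copied data.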
# Create the KV caches on the first device.
src_key_caches, src_value_caches = kv_cache_factory(
num_blocks, block_size, 1, num_heads, head_size, dtype, seed,
src_device)
# Create the KV caches on the second device.
dist_key_caches, dist_value_caches = kv_cache_factory(
num_blocks, block_size, 1, num_heads, head_size, dtype, seed,
dst_device)
src_key_caches_clone = src_key_caches[0].clone()
src_value_caches_clone = src_value_caches[0].clone()
# Call the swap_blocks kernel.
cache_ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping)
cache_ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
block_mapping)
for src, dst in block_mapping.items():
assert torch.allclose(src_key_caches_clone[src].cpu(),
dist_key_caches[0][dst].cpu())
assert torch.allclose(src_value_caches_clone[src].cpu(),
dist_value_caches[0][dst].cpu())
import pytest
import torch
from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.activation import SiluAndMul
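# Reference MoE: replicate each token once per selected expert, run every copy
# through that expert's SiLU-and-mul MLP (w1 then w2), and sum the outputs
# weighted by the router's top-k scores.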
def torch_moe(a, w1, w2, topk_weight, topk_ids):
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D)
out = torch.zeros(B * topk_ids.shape[1],
w2.shape[1],
dtype=a.dtype,
device=a.device)
topk_ids = topk_ids.view(-1)
topk_weight = topk_weight.view(-1)
for i in range(w1.shape[0]):
mask = topk_ids == i
if mask.sum():
out[mask] = SiluAndMul()(
a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
return (out.view(B, -1, w2.shape[1]) *
topk_weight.view(B, -1, 1)).sum(dim=1)
@pytest.mark.parametrize("m", [512, 222, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", [8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_fused_moe(
m: int,
n: int,
k: int,
e: int,
topk: int,
dtype: torch.dtype,
):
a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
score = torch.randn((m, e), device='cuda', dtype=dtype)
score = torch.softmax(score, dim=-1)
topk_weight, topk_ids = torch.topk(score, topk)
triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False)
torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids)
assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
import random
import pytest
import time
import torch
from vllm.model_executor.layers.triton_kernel.prefix_prefill import (
context_attention_fwd)
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
NUM_HEADS = [12]
HEAD_SIZES = [128]
DTYPES = [torch.float16]
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@torch.inference_mode()
def test_contexted_kv_attention(
num_heads: int,
head_size: int,
dtype: torch.dtype,
) -> None:
random.seed(0)
torch.manual_seed(0)
MAX_SEQ_LEN = 1024
MAX_CTX_LEN = 1024
BS = 10
cache_size = 640
block_size = 32
max_block_per_request = 64
subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
num_tokens = sum(subquery_lens)
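# Each sequence has ctx_len tokens already in the cache (the prefix) plus
# subquery_len new tokens; only the new tokens act as queries.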
query = torch.empty(num_tokens,
num_heads,
head_size,
dtype=dtype,
device='cuda')
query.uniform_(-1e-3, 1e-3)
output = torch.empty(num_tokens,
num_heads,
head_size,
dtype=dtype,
device='cuda')
kv = torch.empty(sum(seq_lens),
2,
num_heads,
head_size,
dtype=dtype,
device='cuda')
kv.uniform_(-1e-3, 1e-3)
key, value = kv.unbind(dim=1)
k_cache = torch.zeros(cache_size,
block_size,
num_heads,
head_size,
dtype=dtype,
device='cuda')
v_cache = torch.zeros(cache_size,
block_size,
num_heads,
head_size,
dtype=dtype,
device='cuda')
k = torch.zeros(sum(subquery_lens),
num_heads,
head_size,
dtype=dtype,
device='cuda')
v = torch.zeros(sum(subquery_lens),
num_heads,
head_size,
dtype=dtype,
device='cuda')
values = torch.arange(0, cache_size, dtype=torch.long, device='cuda')
values = values[torch.randperm(cache_size)]
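# Shuffle the cache block indices and give every sequence its own contiguous
# slice of max_block_per_request block IDs.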
block_table = values[:BS * max_block_per_request].view(
BS, max_block_per_request)
b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda')
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda')
b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
dtype=torch.long,
device='cuda'),
dim=0)
max_input_len = MAX_SEQ_LEN
# copy kv to cache
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
dtype=torch.long,
device='cuda'),
dim=0)
for i in range(BS):
for j in range(subquery_lens[i]):
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
j])
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
b_ctx_len[i] + j])
cur_ctx = 0
block_id = 0
while cur_ctx < b_ctx_len[i]:
start_loc = b_seq_start_loc[i] + cur_ctx
if cur_ctx + block_size > b_ctx_len[i]:
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
else:
end_loc = start_loc + block_size
start_slot = block_table[i, block_id] * block_size
end_slot = start_slot + end_loc - start_loc
k_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_(
key[start_loc:end_loc])
v_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_(
value[start_loc:end_loc])
cur_ctx += block_size
block_id += 1
# transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
# to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
k_cache = k_cache.view(-1, block_size, num_heads, head_size // 8,
8).permute(0, 2, 3, 1, 4).contiguous()
# transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
v_cache = v_cache.view(-1, block_size, num_heads,
head_size).permute(0, 2, 3, 1).contiguous()
# Warm up the Triton kernel by calling it once before actually measuring generation time
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
b_start_loc, b_seq_len, b_ctx_len, max_input_len)
torch.cuda.synchronize()
start_time = time.time()
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
b_start_loc, b_seq_len, b_ctx_len, max_input_len)
torch.cuda.synchronize()
end_time = time.time()
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
scale = float(1.0 / (head_size**0.5))
attn_op = xops.fmha.cutlass.FwOp()
attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
subquery_lens, seq_lens)
output_ref = xops.memory_efficient_attention_forward(
query.unsqueeze(0),
key.unsqueeze(0),
value.unsqueeze(0),
attn_bias=attn_bias,
p=0.0,
scale=scale,
op=attn_op,
)
torch.cuda.synchronize()
start_time = time.time()
output_ref = xops.memory_efficient_attention_forward(
query.unsqueeze(0),
key.unsqueeze(0),
value.unsqueeze(0),
attn_bias=attn_bias,
p=0.0,
scale=scale,
op=attn_op,
)
torch.cuda.synchronize()
end_time = time.time()
print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
output_ref = output_ref.squeeze(0)
assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
import contextlib
import gc
import tempfile
from collections import OrderedDict
from unittest.mock import patch, MagicMock
import pytest
import ray
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parallel_utils.parallel_state import (
destroy_model_parallel, initialize_model_parallel)
def cleanup():
destroy_model_parallel()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
ray.shutdown()
@pytest.fixture(autouse=True)
def cleanup_fixture():
yield
cleanup()
@pytest.fixture
def dist_init():
if not torch.distributed.is_initialized():
temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(
backend="nccl",
world_size=1,
rank=0,
init_method=f"file://{temp_file}",
)
torch.distributed.all_reduce(torch.zeros(1).cuda())
initialize_model_parallel(1, 1)
yield
cleanup()
@pytest.fixture
def dist_init_torch_only():
if torch.distributed.is_initialized():
return
temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(
backend="nccl",
world_size=1,
rank=0,
init_method=f"file://{temp_file}",
)
@pytest.fixture
def dummy_model() -> nn.Module:
model = nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
])),
),
("act2", nn.ReLU()),
("output", ColumnParallelLinear(50, 10)),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("sampler", Sampler(512))
]))
model.config = MagicMock()
return model
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
model = nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
])),
),
("act2", nn.ReLU()),
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("sampler", Sampler(512))
]))
model.config = MagicMock()
return model
@pytest.fixture(scope="session")
def sql_lora_files():
return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
@pytest.fixture
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
cleanup()
get_model_old = get_model
def get_model_patched(model_config, lora_config=None):
return get_model_old(model_config,
LoRAConfig(max_loras=4, max_lora_rank=8))
with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
yield engine.llm_engine
del engine
cleanup()
@pytest.fixture
def llama_2_7b_model_extra_embeddings(
llama_2_7b_engine_extra_embeddings) -> nn.Module:
yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model
import pytest
import random
from copy import deepcopy
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple
import torch
import torch.nn.functional as F
from vllm.lora.layers import (
ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
QKVParallelLinearWithLora,
VocabParallelEmbeddingWithLoRA,
RowParallelLinearWithLoRA,
SamplerWithLoRA,
LoRAMapping,
BaseLayerWithLoRA,
)
from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights
from vllm.config import LoRAConfig
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear,
QKVParallelLinear)
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.utils import set_random_seed
from .utils import DummyLoRAManager
TOLERANCES = {
torch.float16: (5e-3, 5e-3),
torch.float32: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2),
}
def get_random_id_to_index(num_loras: int,
num_slots: int,
log: bool = True) -> List[Optional[int]]:
"""Creates a random lora_id_to_index mapping.
Args:
num_loras: The number of active loras in the mapping.
num_slots: The number of slots in the mapping. Must be larger
than num_loras.
log: Whether to log the output.
"""
if num_loras > num_slots:
raise ValueError(
f"num_loras is higher than num_slots: {num_loras} > {num_slots}. "
"num_loras must be less than or equal to num_slots.")
slots: List[Optional[int]] = [None] * num_slots
random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist()
for lora_id, slot_idx in enumerate(random_slot_selections, start=1):
slots[slot_idx] = lora_id
if log:
print(f"Created lora_id_to_index mapping: {slots}.")
return slots
def populate_loras(
id_to_index: List[Optional[int]],
layer: BaseLayerWithLoRA,
layer_weights: torch.Tensor,
generate_embeddings_tensor: int = 0,
repeats: int = 1,
) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]:
"""This method populates the lora layers with lora weights.
Args:
id_to_index: a list of lora ids. The index of the lora id
represents which memory slot the lora matrices are
stored in. A None value indicates a free slot.
layer: the LoRAlayer to populate.
layer_weights: the PyTorch tensor containing the layer's
weights.
generate_embeddings_tensor: whether to generate an
embeddings tensor for each LoRA.
repeats: must only be set for column parallel packed
layers. Indicates the number of loras to compose
together to create a single lora layer.
"""
# Dictionary that maps the lora ID to the
# corresponding lora weights.
lora_dict: Dict[int, LoRALayerWeights] = dict()
# Dictionary that maps the lora ID to the
# corresponding subloras. Only useful when
# repeats > 1.
sublora_dict: Dict[int, List[LoRALayerWeights]] = dict()
for slot_idx, lora_id in enumerate(id_to_index):
if lora_id is not None:
subloras = []
sublora_len = layer_weights.shape[0] // repeats
for i in range(repeats):
sublora = DummyLoRAManager().init_random_lora(
module_name=f"fake_{i}",
weight=layer_weights,
generate_embeddings_tensor=generate_embeddings_tensor,
)
sublora.lora_b = sublora.lora_b[:, (sublora_len *
i):(sublora_len * (i + 1))]
sublora.optimize()
subloras.append(sublora)
lora = PackedLoRALayerWeights.pack(
subloras) if repeats > 1 else subloras[0]
layer.set_lora(
slot_idx,
lora_a=lora.lora_a,
lora_b=lora.lora_b,
embeddings_tensor=lora.embeddings_tensor,
)
lora_dict[lora_id] = lora
sublora_dict[lora_id] = subloras
return lora_dict, sublora_dict
def create_random_inputs(
active_lora_ids: List[int],
num_inputs: int,
input_size: Tuple[int, ...],
input_range: Tuple[float, float],
input_type: torch.dtype = torch.int,
) -> Tuple[List[torch.Tensor], List[int], List[int]]:
"""Creates random inputs.
Args:
active_lora_ids: lora IDs of active lora weights.
num_inputs: the number of inputs to create.
input_size: the size of each individual input.
input_range: the range of values to include in the input.
input_range[0] <= possible input values < input_range[1]
input_type: the type of values in the input.
"""
low, high = input_range
inputs, index_mapping, prompt_mapping = [], [], []
for _ in range(num_inputs):
if input_type == torch.int:
inputs.append(
torch.randint(low=int(low),
high=int(high),
size=input_size,
device="cuda"))
else:
inputs.append(
torch.rand(size=input_size, dtype=input_type, device="cuda") *
high + low)
lora_id = random.choice(active_lora_ids)
index_mapping += [lora_id] * input_size[0]
prompt_mapping += [lora_id]
return inputs, index_mapping, prompt_mapping
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
def test_embeddings(dist_init, num_loras) -> None:
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16)
def create_random_embedding_layer():
embedding = VocabParallelEmbedding(512, 256)
embedding.weight.data = torch.rand_like(embedding.weight.data)
embedding.weight.data[512:, :] = 0
lora_embedding = VocabParallelEmbeddingWithLoRA(embedding)
lora_embedding.create_lora_weights(max_loras, lora_config)
return embedding, lora_embedding
for i in range(10):
set_random_seed(i)
id_to_index = get_random_id_to_index(num_loras, max_loras)
embedding, lora_embedding = create_random_embedding_layer()
lora_dict, _ = populate_loras(
id_to_index,
layer=lora_embedding,
layer_weights=embedding.weight.T,
)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=list(lora_dict.keys()),
num_inputs=num_loras * 3,
input_size=(200, ),
input_range=(1, 512),
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
512, lora_config.lora_extra_vocab_size)
lora_embedding.set_mapping(*mapping_info)
lora_result = lora_embedding(torch.cat(inputs))
expected_results = []
for input_, lora_id in zip(inputs, prompt_mapping):
lora = lora_dict[lora_id]
result = embedding(input_)
after_a = F.embedding(
input_,
lora.lora_a,
)
result += (after_a @ lora.lora_b)
expected_results.append(result)
expected_result = torch.cat(expected_results)
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
# Check that resetting the lora weights succeeds
for slot_idx in range(max_loras):
lora_embedding.reset_lora(slot_idx)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=[0],
num_inputs=num_loras * 3,
input_size=(200, ),
input_range=(1, 512),
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
512, lora_config.lora_extra_vocab_size)
lora_embedding.set_mapping(*mapping_info, )
lora_result = lora_embedding(torch.cat(inputs))
expected_result = embedding(torch.cat(inputs))
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
@torch.inference_mode()
# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.")
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16)
def create_random_embedding_layer():
embedding = VocabParallelEmbedding(512, 256)
embedding_data = torch.rand_like(embedding.weight.data)
embedding.weight.data = embedding_data
embedding.weight.data[512:, :] = 0
expanded_embedding = VocabParallelEmbedding(
512 + lora_config.lora_extra_vocab_size * max_loras,
256,
org_num_embeddings=512)
expanded_embedding.weight.data[:512, :] = embedding_data
# We need to deepcopy the embedding as it will be modified
# in place
lora_embedding = VocabParallelEmbeddingWithLoRA(
deepcopy(expanded_embedding))
lora_embedding.create_lora_weights(max_loras, lora_config)
return expanded_embedding, lora_embedding
for i in range(10):
set_random_seed(i)
id_to_index = get_random_id_to_index(num_loras, max_loras)
expanded_embedding, lora_embedding = create_random_embedding_layer()
lora_dict, _ = populate_loras(
id_to_index,
layer=lora_embedding,
layer_weights=torch.zeros(
(256, 512 + lora_config.lora_extra_vocab_size)),
generate_embeddings_tensor=256,
)
# All embeddings tensors have the same shape.
embeddings_tensors = [
lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys())
]
embeddings_tensor_len = embeddings_tensors[0].shape[0]
# Add empty embeddings_tensors for unoccupied lora slots.
for _ in range(max_loras - len(embeddings_tensors)):
embeddings_tensors.append(
torch.zeros(embeddings_tensors[0].shape, device="cuda"))
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=list(lora_dict.keys()),
num_inputs=num_loras * 3,
input_size=(200, ),
input_range=(1, 512),
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
original_inputs = deepcopy(inputs)
# Force some of the inputs to be in the extended embeddings range
# to guarantee that their behavior is tested.
for input_, original_input_, lora_id in zip(inputs, original_inputs,
prompt_mapping):
embedding_id = lora_id - 1
input_[-1] = 512 + (embedding_id * embeddings_tensor_len)
original_input_[-1] = 512
input_[-2] = 512 + ((embedding_id + 1) * embeddings_tensor_len - 1)
original_input_[-2] = 512 + embeddings_tensor_len - 1
mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
512, lora_config.lora_extra_vocab_size)
lora_embedding.set_mapping(*mapping_info, )
expanded_embedding.weight[512:512 +
(embeddings_tensor_len *
max_loras)] = torch.cat(embeddings_tensors)
lora_result = lora_embedding(torch.cat(original_inputs))
expected_results = []
for input_, original_input_, lora_id in zip(inputs, original_inputs,
prompt_mapping):
lora = lora_dict[lora_id]
result = expanded_embedding(input_)
after_a = F.embedding(
original_input_,
lora.lora_a,
)
result += (after_a @ lora.lora_b)
expected_results.append(result)
expected_result = torch.cat(expected_results)
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
# Check that resetting the lora weights succeeds
for slot_idx in range(max_loras):
lora_embedding.reset_lora(slot_idx)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=[0],
num_inputs=num_loras * 3,
input_size=(200, ),
input_range=(1, 512),
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
original_inputs = deepcopy(inputs)
mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
512, lora_config.lora_extra_vocab_size)
lora_embedding.set_mapping(*mapping_info, )
lora_result = lora_embedding(torch.cat(original_inputs))
expected_result = expanded_embedding(torch.cat(inputs))
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
def test_lm_head_sampler(dist_init, num_loras) -> None:
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16)
def create_random_sampler_layer():
linear = ParallelLMHead(32000 + lora_config.lora_extra_vocab_size,
1024, 32000)
linear.weight.data = torch.rand_like(linear.weight.data)
linear.weight.data[:, 32000:] = 0
sampler = Sampler(32000 + lora_config.lora_extra_vocab_size, 32000)
lora_sampler = SamplerWithLoRA(sampler, 1024, linear.weight.dtype,
linear.weight.device)
lora_sampler.create_lora_weights(max_loras, lora_config)
return linear, sampler, lora_sampler
for i in range(10):
set_random_seed(i)
id_to_index = get_random_id_to_index(num_loras, max_loras)
linear, sampler, lora_sampler = create_random_sampler_layer()
# NOTE: all the generated loras share the same embeddings tensor.
lora_dict, _ = populate_loras(
id_to_index,
layer=lora_sampler,
layer_weights=linear.weight,
generate_embeddings_tensor=1024,
)
embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor
embeddings_tensor_len = embeddings_tensor.shape[0]
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=list(lora_dict.keys()),
num_inputs=8 * num_loras, # * 3,
input_size=(1, 1024),
input_range=(0, 1),
input_type=torch.float32,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
input_ = torch.rand(20, 1024, device="cuda")
mapping_info = convert_mapping(
lora_mapping,
id_to_index,
max_loras,
32000,
lora_config.lora_extra_vocab_size,
)
lora_sampler.set_mapping(*mapping_info, )
lora_result = lora_sampler._get_logits(hidden_states=torch.cat(inputs),
embedding=linear.weight,
embedding_bias=None)
original_weight = linear.weight.clone()
linear.weight[sampler.org_vocab_size:sampler.org_vocab_size +
embeddings_tensor_len] = embeddings_tensor
sampler.org_vocab_size = 32000 + lora_config.lora_extra_vocab_size
expected_results = []
for input_, lora_id in zip(inputs, prompt_mapping):
lora = lora_dict[lora_id]
result = sampler._get_logits(hidden_states=input_,
embedding=linear.weight,
embedding_bias=None)
result[:, 32000 + embeddings_tensor_len:] = float("-inf")
result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
expected_results.append(result)
expected_result = torch.cat(expected_results)
sampler.org_vocab_size = 32000
# Check that resetting the lora weights succeeds
for slot_idx in range(max_loras):
lora_sampler.reset_lora(slot_idx)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=[0],
num_inputs=8 * num_loras * 3,
input_size=(1, 1024),
input_range=(0, 1),
input_type=torch.float32,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
32000,
lora_config.lora_extra_vocab_size)
lora_sampler.set_mapping(*mapping_info, )
lora_result = lora_sampler._get_logits(hidden_states=torch.cat(inputs),
embedding=original_weight,
embedding_bias=None)[:, :32000]
expected_result = sampler._get_logits(hidden_states=torch.cat(inputs),
embedding=original_weight,
embedding_bias=None)
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("orientation", ["row", "column"])
def test_linear_parallel(dist_init, num_loras, orientation) -> None:
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16)
def create_random_linear_parallel_layer():
if orientation == "row":
linear = RowParallelLinear(4096, 4096, bias=False)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = RowParallelLinearWithLoRA(linear)
else:
linear = ColumnParallelLinear(4096, 4096, bias=False)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = ColumnParallelLinearWithLoRA(linear)
lora_linear.create_lora_weights(max_loras, lora_config)
return linear, lora_linear
for i in range(10):
set_random_seed(i)
id_to_index = get_random_id_to_index(num_loras, max_loras)
linear, lora_linear = create_random_linear_parallel_layer()
lora_dict, _ = populate_loras(
id_to_index,
layer=lora_linear,
layer_weights=linear.weight,
)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=list(lora_dict.keys()),
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(
lora_mapping,
id_to_index,
max_loras,
512,
lora_config.lora_extra_vocab_size,
)
lora_linear.set_mapping(*mapping_info, )
lora_result = lora_linear(torch.cat(inputs))[0]
expected_results = []
for input_, lora_id in zip(inputs, prompt_mapping):
lora = lora_dict[lora_id]
result = linear(input_)[0]
result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
expected_results.append(result)
expected_result = torch.cat(expected_results)
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
# Check that resetting the lora weights succeeds
for slot_idx in range(max_loras):
lora_linear.reset_lora(slot_idx)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=[0],
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(lora_mapping, id_to_index, max_loras,
512, lora_config.lora_extra_vocab_size)
lora_linear.set_mapping(*mapping_info, )
lora_result = lora_linear(torch.cat(inputs))[0]
expected_result = linear(torch.cat(inputs))[0]
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("repeats", [2, 3])
def test_column_parallel_packed(dist_init, num_loras, repeats) -> None:
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16)
def create_column_parallel_packed_layer():
if repeats == 2:
linear = MergedColumnParallelLinear(4096, [4096] * repeats,
bias=False)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedColumnParallelLinearWithLoRA(linear)
else:
linear = QKVParallelLinear(4096, 64, 32, bias=False)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = QKVParallelLinearWithLora(linear)
@dataclass
class FakeConfig:
hidden_size = 4096
num_key_value_heads = 32
num_attention_heads = 32
lora_linear.create_lora_weights(max_loras,
lora_config,
model_config=FakeConfig())
return linear, lora_linear
for i in range(10):
set_random_seed(i)
id_to_index = get_random_id_to_index(num_loras, max_loras)
linear, lora_linear = create_column_parallel_packed_layer()
lora_dict, sublora_dict = populate_loras(
id_to_index,
layer=lora_linear,
layer_weights=linear.weight,
repeats=repeats,
)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=list(lora_dict.keys()),
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(
lora_mapping,
id_to_index,
max_loras,
512,
lora_config.lora_extra_vocab_size,
)
lora_linear.set_mapping(*mapping_info)
lora_result = lora_linear(torch.cat(inputs))[0]
expected_results = []
for input_, lora_id in zip(inputs, prompt_mapping):
result = linear(input_)[0]
subloras = sublora_dict[lora_id]
for i, sublora in enumerate(subloras):
result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * (
i + 1
)] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling
expected_results.append(result)
expected_result = torch.cat(expected_results)
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
for slot_idx in range(max_loras):
lora_linear.reset_lora(slot_idx)
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=[0],
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
input_type=torch.float32,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
mapping_info = convert_mapping(
lora_mapping,
id_to_index,
max_loras,
512,
lora_config.lora_extra_vocab_size,
)
lora_linear.set_mapping(*mapping_info)
lora_result = lora_linear(torch.cat(inputs))[0]
expected_result = linear(torch.cat(inputs))[0]
rtol, atol = TOLERANCES[lora_result.dtype]
assert torch.allclose(lora_result,
expected_result,
rtol=rtol,
atol=atol)
import pytest
import ray
import vllm
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
def do_sample(llm, lora_path: str, lora_id: int):
prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]"
]
sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=256,
stop=["[/assistant]"])
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
@pytest.mark.parametrize("tp_size", [1])
def test_llama_lora(sql_lora_files, tp_size):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=tp_size)
expected_no_lora_output = [
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",
"\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",
" Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",
"\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",
]
expected_lora_output = [
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "
]
print("lora adapter created")
assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
print("lora 1")
assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output
print("no lora")
assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
print("lora 2")
assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output
print("removing lora")
@pytest.mark.skip("Requires multiple GPUs")
def test_llama_tensor_parallel_equality(sql_lora_files):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
llm_tp1 = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=1)
output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
del llm_tp1
cleanup()
llm_tp2 = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2)
output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
del llm_tp2
cleanup()
assert output_tp1 == output_tp2
llm_tp4 = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=4)
output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
del llm_tp4
cleanup()
assert output_tp1 == output_tp4
def test_llama_lora_warmup(sql_lora_files):
"""Test that the LLM initialization works with a warmup LORA path and is more conservative"""
@ray.remote(num_gpus=1)
def get_num_gpu_blocks_lora():
llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
return num_gpu_blocks_lora_warmup
@ray.remote(num_gpus=1)
def get_num_gpu_blocks_no_lora():
llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
return num_gpu_blocks_no_lora_warmup
num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
num_gpu_blocks_no_lora_warmup = ray.get(
get_num_gpu_blocks_no_lora.remote())
assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
"The warmup with lora should be more"
" conservative than without lora, therefore the number of memory blocks for the KV cache should be "
"less when using lora than when not using lora")
import pytest
import torch
from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
from .utils import DummyLoRAManager
TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
QKV_TENSOR_SIZES = [
(8192, 1024, 1024),
(8192 // 8, 1024 // 8, 1024 // 8),
(4096, 4096, 4096),
(4096 // 2, 4096 // 2, 4096 // 2),
]
BATCH_SIZES = [8, 32, 256]
RANKS = [8]
DTYPES = [torch.float16]
TOLERANCES = {
torch.float16: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2),
}
@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora(m, n, k, rank, dtype) -> None:
manager = DummyLoRAManager()
module_name = "module"
weight = torch.rand([m, n], device="cuda", dtype=dtype)
manager.init_random_lora(module_name, weight, rank=rank)
lora = manager.get_module_lora(module_name)
input = torch.rand(k, n, device="cuda", dtype=dtype)
expected = input @ lora.lora_a @ lora.lora_b * lora.scaling
lora_a_stack = torch.zeros(8,
1,
lora.lora_a.shape[1],
lora.lora_a.shape[0],
device="cuda",
dtype=dtype)
lora_b_stack = torch.zeros(8,
1,
lora.lora_b.shape[1],
lora.lora_b.shape[0],
device="cuda",
dtype=dtype)
for i in range(lora_a_stack.shape[0]):
lora_a_stack[i][0] = lora.lora_a.T
lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T
output = torch.zeros(k, m, device="cuda", dtype=dtype)
_apply_lora(
input, lora_a_stack, lora_b_stack,
torch.randint(0, lora_a_stack.shape[0], (len(input), ), device="cuda"),
output)
rtol, atol = TOLERANCES[dtype]
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
output[:] = 0
_apply_lora(input, lora_a_stack, lora_b_stack,
torch.full((len(input), ), -1, device="cuda"), output)
assert torch.allclose(torch.zeros_like(output), output)
manager.reset_lora()
@pytest.mark.parametrize("m", TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
if m % 2 != 0:
pytest.skip("m must be divisible by 2")
if m // 2 not in TENSOR_SIZES:
pytest.skip("m//2 must be in TENSOR_SIZES")
manager = DummyLoRAManager()
module_name = "module"
weight = torch.rand([m // 2, n], device="cuda", dtype=dtype)
manager.init_random_lora(module_name + "1", weight, rank=rank)
lora_1 = manager.get_module_lora(module_name + "1")
manager.init_random_lora(module_name + "2", weight, rank=rank)
lora_2 = manager.get_module_lora(module_name + "2")
input = torch.rand(k, n, device="cuda", dtype=dtype)
expected = torch.cat([
input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling,
input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling
],
dim=1)
lora_a_stacks = [
torch.zeros(8,
1,
lora_1.lora_a.shape[1],
lora_1.lora_a.shape[0],
device="cuda",
dtype=dtype) for i in range(2)
]
lora_b_stacks = [
torch.zeros(8,
1,
lora_1.lora_b.shape[1],
lora_1.lora_b.shape[0],
device="cuda",
dtype=dtype) for i in range(2)
]
for i in range(lora_a_stacks[0].shape[0]):
lora_a_stacks[0][i][0] = lora_1.lora_a.T
lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T
lora_a_stacks[1][i][0] = lora_2.lora_a.T
lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T
output = torch.zeros(k, m, device="cuda", dtype=dtype)
_apply_lora_packed_nslice(
input, lora_a_stacks, lora_b_stacks,
torch.randint(0,
lora_a_stacks[0].shape[0], (len(input), ),
device="cuda"), output, (m // 2, m // 2))
rtol, atol = TOLERANCES[dtype]
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
output[:] = 0
_apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
torch.full((len(input), ), -1, device="cuda"),
output, (m // 2, m // 2))
assert torch.allclose(torch.zeros_like(output), output)
manager.reset_lora()
@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
@pytest.mark.parametrize("n", TENSOR_SIZES)
@pytest.mark.parametrize("k", BATCH_SIZES)
@pytest.mark.parametrize("rank", RANKS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
manager = DummyLoRAManager()
module_name = "module"
weight_q = torch.empty(qkv[0], n, device="cuda", dtype=dtype)
weight_kv = torch.empty(qkv[1], n, device="cuda", dtype=dtype)
manager.init_random_lora(module_name + "q", weight_q, rank=rank)
lora_q = manager.get_module_lora(module_name + "q")
manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
lora_k = manager.get_module_lora(module_name + "k")
manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
lora_v = manager.get_module_lora(module_name + "v")
input = torch.rand(k, n, device="cuda", dtype=dtype)
expected = torch.cat([
input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling,
input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling,
input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling
],
dim=1)
lora_a_stacks = [
torch.zeros(8,
1,
lora_q.lora_a.shape[1],
lora_q.lora_a.shape[0],
device="cuda",
dtype=dtype)
] + [
torch.zeros(8,
1,
lora_k.lora_a.shape[1],
lora_k.lora_a.shape[0],
device="cuda",
dtype=dtype) for i in range(2)
]
lora_b_stacks = [
torch.zeros(8,
1,
lora_q.lora_b.shape[1],
lora_q.lora_b.shape[0],
device="cuda",
dtype=dtype)
] + [
torch.zeros(8,
1,
lora_k.lora_b.shape[1],
lora_k.lora_b.shape[0],
device="cuda",
dtype=dtype) for i in range(2)
]
for i in range(lora_a_stacks[0].shape[0]):
lora_a_stacks[0][i][0] = lora_q.lora_a.T
lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T
lora_a_stacks[1][i][0] = lora_k.lora_a.T
lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T
lora_a_stacks[2][i][0] = lora_v.lora_a.T
lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T
output = torch.zeros(k, sum(qkv), device="cuda", dtype=dtype)
_apply_lora_packed_nslice(
input, lora_a_stacks, lora_b_stacks,
torch.randint(0,
lora_a_stacks[0].shape[0], (len(input), ),
device="cuda"), output, (qkv[0], qkv[1], qkv[2]))
rtol, atol = TOLERANCES[dtype]
assert torch.allclose(expected, output, rtol=rtol, atol=atol)
output[:] = 0
_apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks,
torch.full((len(input), ), -1, device="cuda"),
output, (qkv[0], qkv[1], qkv[2]))
assert torch.allclose(torch.zeros_like(output), output)
manager.reset_lora()
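# In both packed tests above, the final tuple argument to
# _apply_lora_packed_nslice gives the per-slice output width: (m // 2, m // 2)
# for the 2-slice case and the (q, k, v) sizes for the 3-slice case.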
import os
from typing import List
import pytest
import torch
from safetensors.torch import load_file
from torch import nn
from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA)
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.models import (EMBEDDING_MODULES, LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, LoRAMapping)
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)
from vllm.model_executor.layers.linear import RowParallelLinear
def test_from_lora_tensors(sql_lora_files):
tensors = load_file(
os.path.join(sql_lora_files, "adapter_model.safetensors"))
new_embeddings = load_file(
os.path.join(sql_lora_files, "new_embeddings.safetensors"))
lora_model = LoRAModel.from_lora_tensors(1,
8,
16,
tensors,
"cuda",
embeddings=new_embeddings)
for module_name, lora in lora_model.loras.items():
assert lora.module_name == module_name
assert lora.rank == 8
assert lora.lora_alpha == 16
assert lora.lora_a is not None
assert lora.lora_b is not None
assert (lora.lora_a.shape[1] == lora.lora_b.shape[0]
), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
assert lora.lora_a.shape[1] == 8
embeddings_module = next(
(k for k in EMBEDDING_MODULES if k in module_name), None)
if embeddings_module:
assert torch.equal(
lora.embeddings_tensor,
new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
device=lora.embeddings_tensor.device))
else:
assert lora.embeddings_tensor is None
def create_lora(lora_id: int, model: nn.Module,
sub_modules: List[str]) -> LoRAModel:
loras = {}
for name in sub_modules:
w = model.get_submodule(name).weight
loras[name] = LoRALayerWeights(
name,
8,
16,
torch.rand([w.shape[1], 8], device="cuda"),
torch.rand([8, w.shape[0]], device="cuda"),
)
return LoRAModel(lora_id, 8, loras)
def create_packed_lora(
lora_id: int,
model: nn.Module,
module_name,
replaced_module_names,
empty_replaced_module_name=None,
) -> LoRAModel:
w = model.get_submodule(module_name).weight
loras = {}
for replaced_module_name in replaced_module_names:
if replaced_module_name == empty_replaced_module_name:
continue
loras[replaced_module_name] = LoRALayerWeights(
replaced_module_name,
8,
16,
torch.rand([w.shape[1], 8], device="cuda"),
torch.rand([8, w.shape[0] // len(replaced_module_names)],
device="cuda"),
)
return LoRAModel(lora_id, 8, loras)
def test_replace_submodules(dist_init, dummy_model):
model = dummy_model
manager = LoRAModelManager(model,
1,
1,
1,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=8,
max_loras=8),
lora_target_modules=["dense1", "layer1.dense2"])
model = manager.model
assert isinstance(model.get_submodule("dense1"),
ColumnParallelLinearWithLoRA)
assert isinstance(model.get_submodule("layer1.dense1"),
ColumnParallelLinearWithLoRA)
assert isinstance(model.get_submodule("dense2"), RowParallelLinear)
assert isinstance(model.get_submodule("layer1.dense2"),
RowParallelLinearWithLoRA)
def test_lora_model_manager(dist_init, dummy_model):
model = dummy_model
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
manager = LoRAModelManager(
model,
2,
2,
2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2),
lora_target_modules=["dense1", "dense2", "lm_head"])
assert all(x is None for x in manager.lora_index_to_id)
assert manager.add_lora(model_lora1)
assert manager.activate_lora(1)
assert manager.lora_index_to_id[0] == 1
assert not manager.add_lora(model_lora1)
assert not manager.activate_lora(1)
assert manager.add_lora(model_lora2)
assert manager.activate_lora(2)
assert manager.lora_index_to_id[0] == 1
assert manager.lora_index_to_id[1] == 2
assert not manager.add_lora(model_lora2)
assert not manager.activate_lora(2)
assert manager.add_lora(model_lora3)
assert manager.lora_index_to_id[0] == 1
assert manager.lora_index_to_id[1] == 2
with pytest.raises(ValueError):
assert manager.activate_lora(3)
assert manager.lora_index_to_id[0] == 1
assert manager.lora_index_to_id[1] == 2
assert manager.remove_lora(model_lora2.id)
assert manager.lora_index_to_id[1] is None
assert not manager.remove_lora(model_lora2.id)
assert manager.remove_lora(model_lora1.id)
assert not manager.remove_lora(model_lora1.id)
assert manager.add_lora(model_lora1)
assert manager.lora_index_to_id[0] is None
assert manager.lora_index_to_id[1] is None
assert manager.add_lora(model_lora2)
assert manager.activate_lora(3)
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] is None
assert manager.activate_lora(2)
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 2
def test_lora_lru_cache_model_manager(dist_init, dummy_model):
model = dummy_model
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
manager = LRUCacheLoRAModelManager(
model,
2,
2,
2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2),
lora_target_modules=["dense1", "dense2", "lm_head"])
assert all(x is None for x in manager.lora_index_to_id)
assert manager.add_lora(model_lora1)
assert manager.activate_lora(1)
assert manager.lora_index_to_id[0] == 1
assert not manager.add_lora(model_lora1)
assert not manager.activate_lora(1)
assert manager.add_lora(model_lora2)
assert manager.activate_lora(2)
assert manager.lora_index_to_id[0] == 1
assert manager.lora_index_to_id[1] == 2
assert not manager.add_lora(model_lora2)
assert not manager.activate_lora(2)
assert manager.add_lora(model_lora3)
assert manager.lora_index_to_id[0] == 1
assert manager.lora_index_to_id[1] == 2
assert manager.activate_lora(3)
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 2
assert manager.remove_lora(model_lora2.id)
assert manager.lora_index_to_id[1] is None
assert not manager.remove_lora(model_lora2.id)
assert manager.remove_lora(model_lora1.id)
assert not manager.remove_lora(model_lora1.id)
assert manager.add_lora(model_lora1)
assert manager.activate_lora(1)
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 1
assert manager.add_lora(model_lora2)
assert manager.deactivate_lora(3)
assert manager.lora_index_to_id[0] is None
assert manager.lora_index_to_id[1] == 1
assert manager.activate_lora(2)
assert manager.lora_index_to_id[0] == 2
assert manager.lora_index_to_id[1] == 1
assert manager.activate_lora(3)
assert manager.lora_index_to_id[0] == 2
assert manager.lora_index_to_id[1] == 3
def test_lru_lora_model_manager(dist_init, dummy_model):
    # This tests just the LRU cache functionality; everything else is
    # tested in test_lora_model_manager.
model = dummy_model
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"])
manager = LRUCacheLoRAModelManager(
model, 2, 2, 2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2),
["dense1", "dense2", "lm_head"])
assert all(x is None for x in manager.lora_index_to_id)
# Add up to capacity
assert manager.add_lora(model_lora1)
assert manager.add_lora(model_lora2)
assert manager.activate_lora(1)
assert manager.activate_lora(2)
assert set(manager.list_loras()) == {1, 2}
assert manager.lora_index_to_id[0] == 1
assert manager.lora_index_to_id[1] == 2
# Add over capacity
assert manager.add_lora(model_lora3)
assert manager.add_lora(model_lora4)
assert manager.activate_lora(3)
assert manager.activate_lora(4)
assert set(manager.list_loras()) == {3, 4}
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 4
# Add 3 again to move it to the top and then add 2
# should return false since it's in already
assert not manager.add_lora(model_lora3)
assert not manager.activate_lora(3)
assert manager.add_lora(model_lora2)
assert manager.activate_lora(2)
assert set(manager.list_loras()) == {3, 2}
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 2
# Remove manually
assert manager.remove_lora(3)
assert not manager.remove_lora(3)
assert set(manager.list_loras()) == {2}
assert manager.lora_index_to_id[0] is None
assert manager.lora_index_to_id[1] == 2
assert manager.add_lora(model_lora3)
assert manager.activate_lora(3)
assert manager.add_lora(model_lora4)
assert manager.activate_lora(4)
assert set(manager.list_loras()) == {3, 4}
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 4
assert manager.remove_oldest_lora()
assert set(manager.list_loras()) == {4}
assert manager.lora_index_to_id[0] is None
assert manager.lora_index_to_id[1] == 4
assert manager.remove_oldest_lora()
assert set(manager.list_loras()) == set()
assert all(x is None for x in manager.lora_index_to_id)
assert not manager.remove_oldest_lora()
assert set(manager.list_loras()) == set()
assert all(x is None for x in manager.lora_index_to_id)
def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files):
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_lora_manager = LRUCacheWorkerLoRAManager(
4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config,
torch.device("cuda"))
worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
mapping = LoRAMapping([], [])
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 2}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("3", 3, sql_lora_files),
LoRARequest("4", 4, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 2, 3, 4}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 3
assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files),
LoRARequest("5", 5, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 2, 4, 5}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 4
worker_lora_manager.set_active_loras([
LoRARequest("6", 6, sql_lora_files),
LoRARequest("7", 7, sql_lora_files),
LoRARequest("8", 8, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 6, 7, 8}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 7
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 8
assert worker_lora_manager._lora_manager.lora_index_to_id[3] == 6
# Over capacity
with pytest.raises(RuntimeError):
worker_lora_manager.set_active_loras([
LoRARequest("10", 10, sql_lora_files),
LoRARequest("11", 11, sql_lora_files),
LoRARequest("12", 12, sql_lora_files),
LoRARequest("13", 13, sql_lora_files),
LoRARequest("14", 14, sql_lora_files)
], mapping)
def test_worker_lora_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files):
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_lora_manager = WorkerLoRAManager(
4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config,
torch.device("cuda"))
worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
mapping = LoRAMapping([], [])
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 2}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("3", 3, sql_lora_files),
LoRARequest("4", 4, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 3, 4}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 3
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 4
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files),
LoRARequest("5", 5, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1, 2, 5}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 2
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 5
worker_lora_manager.set_active_loras([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {1}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 1
assert worker_lora_manager._lora_manager.lora_index_to_id[1] is None
assert worker_lora_manager._lora_manager.lora_index_to_id[2] is None
worker_lora_manager.set_active_loras([
LoRARequest("6", 6, sql_lora_files),
LoRARequest("7", 7, sql_lora_files),
LoRARequest("8", 8, sql_lora_files)
], mapping)
assert worker_lora_manager.list_loras() == {6, 7, 8}
assert worker_lora_manager._lora_manager.lora_index_to_id[0] == 8
assert worker_lora_manager._lora_manager.lora_index_to_id[1] == 6
assert worker_lora_manager._lora_manager.lora_index_to_id[2] == 7
# Over capacity
with pytest.raises(RuntimeError):
worker_lora_manager.set_active_loras([
LoRARequest("10", 10, sql_lora_files),
LoRARequest("11", 11, sql_lora_files),
LoRARequest("12", 12, sql_lora_files),
LoRARequest("13", 13, sql_lora_files),
LoRARequest("14", 14, sql_lora_files)
], mapping)
def test_packed_loras(dist_init, dummy_model_gate_up):
model = dummy_model_gate_up
model_lora = create_packed_lora(
1,
model,
module_name="gate_up_proj",
replaced_module_names=["gate_proj", "up_proj"])
model_lora1 = create_packed_lora(
2,
model,
module_name="gate_up_proj",
replaced_module_names=["gate_proj", "up_proj"],
empty_replaced_module_name="gate_proj",
)
manager = LoRAModelManager(
model, 2, 2, 2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2),
["gate_up_proj"])
model = manager.model
assert isinstance(model.get_submodule("gate_up_proj"),
MergedColumnParallelLinearWithLoRA)
assert manager.add_lora(model_lora)
assert manager.add_lora(model_lora1)
packed_lora = model_lora.get_lora("gate_up_proj")
assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
assert torch.allclose(packed_lora.lora_a[0],
model_lora.get_lora("gate_proj").lora_a)
assert torch.allclose(packed_lora.lora_b[0],
model_lora.get_lora("gate_proj").lora_b)
assert torch.allclose(packed_lora.lora_a[1],
model_lora.get_lora("up_proj").lora_a)
assert torch.allclose(packed_lora.lora_b[1],
model_lora.get_lora("up_proj").lora_b)
packed_lora1 = model_lora1.get_lora("gate_up_proj")
assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
assert packed_lora1.lora_a[0] is None
assert packed_lora1.lora_b[0] is None
assert torch.allclose(packed_lora1.lora_a[1],
model_lora1.get_lora("up_proj").lora_a)
assert torch.allclose(packed_lora1.lora_b[1],
model_lora1.get_lora("up_proj").lora_b)
# Based on code from https://github.com/punica-ai/punica
import pytest
import torch
import vllm.lora.punica as punica
def assert_close(a, b):
rtol, atol = {
torch.float16: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2),
torch.float32: (None, None),
}[a.dtype]
torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
def _lora_ref_impl(
y_final: torch.Tensor,
x: torch.Tensor,
wa_T_all: torch.Tensor,
wb_T_all: torch.Tensor,
    indices: torch.LongTensor,
layer_idx: int,
scale: float,
):
y_stage_1 = torch.empty(
(x.size(0), wa_T_all.size(-2)),
dtype=torch.float32,
device=x.device,
)
bs = x.shape[0]
s = torch.tensor(scale, dtype=torch.float32, device=x.device)
    for i, lora_idx in zip(range(bs), indices.cpu().tolist()):
xi = x[i].unsqueeze(0).to(torch.float32)
wa = wa_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
wb = wb_T_all[lora_idx, layer_idx].transpose(-1, -2).to(torch.float32)
tmp = xi @ wa
y_stage_1[i] = tmp.squeeze(0)
y_final[i] += (tmp @ wb).squeeze(0) * s
return y_final, y_stage_1
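# In other words, for each batch row i with LoRA index idx the reference
# computes (a sketch of the loop above):
#   y_stage_1[i] = x[i] @ wa_T_all[idx, layer_idx].T                    # shrink to rank r
#   y_final[i]  += (y_stage_1[i] @ wb_T_all[idx, layer_idx].T) * scale  # expand to h2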
H1 = H2 = [
128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120,
5504, 5632, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 32000,
32256, 32512, 32768, 33024
]
SEED = [0xabcdabcd987]
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_correctness(dtype_str, h1, h2, seed):
torch.manual_seed(seed)
num_loras = 4
num_layers = 1
r = 8
bs = 32
scale = 0.123
dtype = getattr(torch, dtype_str)
device = torch.device("cuda")
wa_T_all = torch.randn(num_loras,
num_layers,
r,
h1,
dtype=dtype,
device=device)
wb_T_all = torch.randn(num_loras,
num_layers,
h2,
r,
dtype=dtype,
device=device)
indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device)
for layer_idx in range(num_layers):
x = torch.randn(bs, h1, dtype=dtype, device=device)
y = torch.randn(bs, h2, dtype=dtype, device=device)
y_ref = y.clone()
_lora_ref_impl(y_ref, x, wa_T_all, wb_T_all, indices, layer_idx, scale)
y_our = y.clone()
punica.add_lora(y_our, x, wa_T_all, wb_T_all, indices, layer_idx,
scale)
assert_close(y_ref, y_our)
@pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"])
@pytest.mark.parametrize("h1", H1)
@pytest.mark.parametrize("h2", H2)
@pytest.mark.parametrize("seed", SEED)
@torch.inference_mode()
def test_lora_correctness_slice(dtype_str, h1, h2, seed):
if h2 % 3 != 0 or h2 // 3 not in H1:
pytest.skip("h2 must be divisible by 3 and in supported shapes")
torch.manual_seed(seed)
num_loras = 4
num_layers = 1
r = 8
bs = 32
scale = 0.123
dtype = getattr(torch, dtype_str)
device = torch.device("cuda")
wa_T_all_0 = torch.randn(num_loras,
num_layers,
r,
h1,
dtype=dtype,
device=device)
wa_T_all_1 = torch.randn(num_loras,
num_layers,
r,
h1,
dtype=dtype,
device=device)
wa_T_all_2 = torch.randn(num_loras,
num_layers,
r,
h1,
dtype=dtype,
device=device)
wb_T_all_0 = torch.randn(num_loras,
num_layers,
h2 // 3,
r,
dtype=dtype,
device=device)
wb_T_all_1 = torch.randn(num_loras,
num_layers,
h2 // 3,
r,
dtype=dtype,
device=device)
wb_T_all_2 = torch.randn(num_loras,
num_layers,
h2 // 3,
r,
dtype=dtype,
device=device)
indices = torch.randint(num_loras, (bs, ), dtype=torch.long, device=device)
for layer_idx in range(num_layers):
x = torch.randn(bs, h1, dtype=dtype, device=device)
y = torch.randn(bs, h2, dtype=dtype, device=device)
s = h2 // 3
y_ref = y.clone()
_lora_ref_impl(y_ref[:, :s], x, wa_T_all_0, wb_T_all_0, indices,
layer_idx, scale)
_lora_ref_impl(y_ref[:, s:s * 2], x, wa_T_all_1, wb_T_all_1, indices,
layer_idx, scale)
_lora_ref_impl(y_ref[:, s * 2:], x, wa_T_all_2, wb_T_all_2, indices,
layer_idx, scale)
y_our = y.clone()
punica.add_lora_slice(y_our, x, wa_T_all_0, wb_T_all_0, indices,
layer_idx, scale, 0, s)
punica.add_lora_slice(y_our, x, wa_T_all_1, wb_T_all_1, indices,
layer_idx, scale, s, s)
punica.add_lora_slice(y_our, x, wa_T_all_2, wb_T_all_2, indices,
layer_idx, scale, s * 2, s)
assert_close(y_ref[:, :s], y_our[:, :s])
assert_close(y_ref[:, s:s * 2], y_our[:, s:s * 2])
assert_close(y_ref[:, s * 2:], y_our[:, s * 2:])
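# The last two arguments to punica.add_lora_slice above are the output column
# offset and slice width, so each call accumulates only into
# y[:, offset:offset + width]; that is why the three slices can be checked
# independently against the reference implementation.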
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import TokenizerGroup, get_lora_tokenizer
@pytest.mark.asyncio
async def test_transformers_tokenizer():
reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = TokenizerGroup(
tokenizer_id="gpt2",
enable_lora=False,
max_num_seqs=1,
max_input_length=None,
)
assert reference_tokenizer.encode("prompt") == tokenizer.encode(
request_id="request_id", prompt="prompt", lora_request=None)
assert reference_tokenizer.encode(
"prompt") == await tokenizer.encode_async(request_id="request_id",
prompt="prompt",
lora_request=None)
assert isinstance(tokenizer.get_lora_tokenizer(None),
PreTrainedTokenizerBase)
assert tokenizer.get_lora_tokenizer(
None) == await tokenizer.get_lora_tokenizer_async(None)
@pytest.mark.asyncio
async def test_transformers_tokenizer_lora(sql_lora_files):
reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files)
tokenizer = TokenizerGroup(
tokenizer_id="gpt2",
enable_lora=True,
max_num_seqs=1,
max_input_length=None,
)
lora_request = LoRARequest("1", 1, sql_lora_files)
assert reference_tokenizer.encode("prompt") == tokenizer.encode(
request_id="request_id", prompt="prompt", lora_request=lora_request)
assert reference_tokenizer.encode(
"prompt") == await tokenizer.encode_async(request_id="request_id",
prompt="prompt",
lora_request=lora_request)
assert isinstance(tokenizer.get_lora_tokenizer(None),
PreTrainedTokenizerBase)
assert tokenizer.get_lora_tokenizer(
None) == await tokenizer.get_lora_tokenizer_async(None)
assert isinstance(tokenizer.get_lora_tokenizer(lora_request),
PreTrainedTokenizerBase)
assert tokenizer.get_lora_tokenizer(
lora_request) != tokenizer.get_lora_tokenizer(None)
assert tokenizer.get_lora_tokenizer(
lora_request) == await tokenizer.get_lora_tokenizer_async(lora_request)
def test_get_lora_tokenizer(sql_lora_files, tmpdir):
lora_request = None
tokenizer = get_lora_tokenizer(lora_request)
assert not tokenizer
lora_request = LoRARequest("1", 1, sql_lora_files)
tokenizer = get_lora_tokenizer(lora_request)
assert tokenizer.get_added_vocab()
lora_request = LoRARequest("1", 1, str(tmpdir))
tokenizer = get_lora_tokenizer(lora_request)
assert not tokenizer
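# get_lora_tokenizer returns a falsy value (no tokenizer) both when no
# LoRARequest is given and when the request path contains no tokenizer files,
# which is what the asserts above rely on.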
from collections import OrderedDict
from torch import nn
from vllm.utils import LRUCache
from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule)
def test_parse_fine_tuned_lora_name():
fixture = {
("base_model.model.lm_head.lora_A.weight", "lm_head", True),
("base_model.model.lm_head.lora_B.weight", "lm_head", False),
(
"base_model.model.model.embed_tokens.lora_embedding_A",
"model.embed_tokens",
True,
),
(
"base_model.model.model.embed_tokens.lora_embedding_B",
"model.embed_tokens",
False,
),
(
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight",
"model.layers.9.mlp.down_proj",
True,
),
(
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
"model.layers.9.mlp.down_proj",
False,
),
}
for name, module_name, is_lora_a in fixture:
assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
def test_replace_submodule():
model = nn.Sequential(
OrderedDict([
("dense1", nn.Linear(764, 100)),
("act1", nn.ReLU()),
("dense2", nn.Linear(100, 50)),
(
"seq1",
nn.Sequential(
OrderedDict([
("dense1", nn.Linear(100, 10)),
("dense2", nn.Linear(10, 50)),
])),
),
("act2", nn.ReLU()),
("output", nn.Linear(50, 10)),
("outact", nn.Sigmoid()),
]))
sigmoid = nn.Sigmoid()
replace_submodule(model, "act1", sigmoid)
assert dict(model.named_modules())["act1"] == sigmoid
dense2 = nn.Linear(1, 5)
replace_submodule(model, "seq1.dense2", dense2)
assert dict(model.named_modules())["seq1.dense2"] == dense2
class TestLRUCache(LRUCache):
def _on_remove(self, key, value):
if not hasattr(self, "_remove_counter"):
self._remove_counter = 0
self._remove_counter += 1
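# _on_remove is the removal hook of LRUCache; the subclass above counts every
# removal (evictions, explicit pops, and clear) so the test below can assert
# how many entries were dropped at each step.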
def test_lru_cache():
cache = TestLRUCache(3)
cache.put(1, 1)
assert len(cache) == 1
cache.put(1, 1)
assert len(cache) == 1
cache.put(2, 2)
assert len(cache) == 2
cache.put(3, 3)
assert len(cache) == 3
assert set(cache.cache) == {1, 2, 3}
cache.put(4, 4)
assert len(cache) == 3
assert set(cache.cache) == {2, 3, 4}
assert cache._remove_counter == 1
assert cache.get(2) == 2
cache.put(5, 5)
assert set(cache.cache) == {2, 4, 5}
assert cache._remove_counter == 2
assert cache.pop(5) == 5
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.pop(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.get(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.put(6, 6)
assert len(cache) == 3
assert set(cache.cache) == {2, 4, 6}
assert 2 in cache
assert 4 in cache
assert 6 in cache
cache.remove_oldest()
assert len(cache) == 2
assert set(cache.cache) == {2, 6}
assert cache._remove_counter == 4
cache.clear()
assert len(cache) == 0
assert cache._remove_counter == 6
cache._remove_counter = 0
cache[1] = 1
assert len(cache) == 1
cache[1] = 1
assert len(cache) == 1
cache[2] = 2
assert len(cache) == 2
cache[3] = 3
assert len(cache) == 3
assert set(cache.cache) == {1, 2, 3}
cache[4] = 4
assert len(cache) == 3
assert set(cache.cache) == {2, 3, 4}
assert cache._remove_counter == 1
assert cache[2] == 2
cache[5] = 5
assert set(cache.cache) == {2, 4, 5}
assert cache._remove_counter == 2
del cache[5]
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.pop(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache[6] = 6
assert len(cache) == 3
assert set(cache.cache) == {2, 4, 6}
assert 2 in cache
assert 4 in cache
assert 6 in cache
import os
import random
import tempfile
from unittest.mock import patch
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig
from vllm.worker.worker import Worker
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):
worker = Worker(
model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto",
trust_remote_code=False,
download_dir=None,
load_format="dummy",
seed=0,
dtype="float16",
revision=None,
),
parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32, 256),
local_rank=0,
rank=0,
lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
max_loras=32),
distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
)
worker.init_model()
worker.load_model()
worker.model_runner.set_active_loras([], LoRAMapping([], []))
assert worker.list_loras() == set()
n_loras = 32
lora_requests = [
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
]
worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
assert worker.list_loras() == {
lora_request.lora_int_id
for lora_request in lora_requests
}
for i in range(32):
random.seed(i)
iter_lora_requests = random.choices(lora_requests,
k=random.randint(1, n_loras))
random.shuffle(iter_lora_requests)
iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
worker.model_runner.set_active_loras(iter_lora_requests,
LoRAMapping([], []))
assert worker.list_loras().issuperset(
{lora_request.lora_int_id
for lora_request in iter_lora_requests})
from typing import List, Optional
import torch
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
class DummyLoRAManager:
def __init__(self):
super().__init__()
self._loras = {}
def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
self._loras[module_name] = lora
def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
return self._loras.get(module_name, None)
def init_random_lora(self,
module_name: str,
weight: torch.Tensor,
rank: int = 8,
generate_embeddings_tensor: int = 0):
lora = LoRALayerWeights(
module_name,
rank=rank,
lora_alpha=1,
lora_a=torch.rand([weight.shape[1], rank],
dtype=weight.dtype,
device="cuda"),
lora_b=torch.rand([rank, weight.shape[0]],
dtype=weight.dtype,
device="cuda"),
)
if generate_embeddings_tensor:
lora.embeddings_tensor = torch.rand(5,
generate_embeddings_tensor,
dtype=weight.dtype,
device="cuda")
self.set_module_lora(module_name, lora)
return lora
def init_lora(self,
module_name: str,
input_dim: int,
output_dim: int,
rank=8,
noop=False,
embeddings_tensor=None):
lora = LoRALayerWeights(
module_name,
rank=rank,
lora_alpha=1,
lora_a=torch.rand([input_dim, rank], device="cuda"),
lora_b=torch.rand([rank, output_dim], device="cuda"),
embeddings_tensor=embeddings_tensor,
)
self.set_module_lora(module_name, lora)
return lora
def reset_lora(self):
self._loras = {}
def init_packed_lora(
self,
module_name: str,
input_dim: int,
output_dims: List[int],
        noop_lora_index: Optional[List[int]] = None,
rank=8,
):
base_loras = []
noop_lora_index = set(noop_lora_index or [])
for i, out_dim in enumerate(output_dims):
base_lora = self.init_lora(
module_name + "_000_" + str(i),
input_dim,
out_dim,
rank=rank,
noop=i in noop_lora_index,
)
base_loras.append(base_lora)
packed_lora = PackedLoRALayerWeights.pack(base_loras)
self.set_module_lora(module_name, packed_lora)
return packed_lora
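# Example usage of the dummy manager (a minimal sketch; shapes are illustrative):
#   manager = DummyLoRAManager()
#   weight = torch.rand([1024, 4096], device="cuda", dtype=torch.float16)
#   lora = manager.init_random_lora("module", weight, rank=8)
#   # lora.lora_a has shape [4096, 8] and lora.lora_b has shape [8, 1024]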
@@ -5,18 +5,11 @@ Run `pytest tests/models/test_models.py --forked`.
 import pytest
 MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
-    "mistralai/Mistral-7B-v0.1",
-    "Deci/DeciLM-7b",
-    "tiiuae/falcon-7b",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/gpt-j-6b",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",
-    "mosaicml/mpt-7b",
-    "microsoft/phi-2",
+    "facebook/opt-125m", "meta-llama/Llama-2-7b-hf",
+    "mistralai/Mistral-7B-v0.1", "Deci/DeciLM-7b", "tiiuae/falcon-7b", "gpt2",
+    "bigcode/tiny_starcoder_py", "EleutherAI/gpt-j-6b",
+    "EleutherAI/pythia-70m", "bigscience/bloom-560m", "mosaicml/mpt-7b",
+    "microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"
 ]
"""Compare the with and without prefix caching.
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import pytest
from vllm import LLM, SamplingParams
prefix = (
"You are an expert school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade "
"Head Teacher for my K-12, all-girls', independent school that emphasizes "
"community, joyful discovery, and life-long learning. The candidate is "
"coming in for a first-round panel interview for a 8th grade Math "
"teaching role. They have 5 years of previous teaching experience "
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on these information, fulfill "
"the following paragraph: ")
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("max_tokens", [16])
def test_prefix_caching(
example_prompts,
model: str,
max_tokens: int,
):
llm = LLM(model=model)
# -1 since the last token can change when concatenating prompts.
prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1
prompts = [prefix + prompt for prompt in example_prompts]
sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs_without_prefix = llm.generate(prompts, sampling_params)
outputs_with_prefix = llm.generate(prompts,
sampling_params,
prefix_pos=[prefix_pos] * len(prompts))
for output_without_prefix, output_with_prefix in zip(
outputs_without_prefix, outputs_with_prefix):
assert (output_without_prefix.outputs[0].token_ids ==
output_with_prefix.outputs[0].token_ids)
assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1