Commit 7e1d5e53 authored by zhuwenwen

merge v0.3.1

parents e3378b20 5f08050d
......@@ -6,6 +6,7 @@ import torch
from typing import Tuple
from vllm._C import cache_ops
from vllm.utils import is_hip
COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float]
......@@ -14,10 +15,15 @@ NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing
# reduce the size for ROCm test to avoid HIP OOM
NUM_BLOCKS = [1024, 36000] if not is_hip() else [
    1024, 10000
]  # Arbitrary values for testing
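For context, `is_hip` comes from `vllm.utils`; a minimal sketch of how such a ROCm check is commonly implemented (an assumption, not taken from this diff):

```python
import torch

def is_hip() -> bool:
    # True when PyTorch was built for ROCm/HIP (e.g. AMD GPUs); the real
    # vllm.utils.is_hip may differ in detail.
    return torch.version.hip is not None
```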
NUM_MAPPINGS = [256] # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
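For context, "auto" keeps the KV cache in the model's dtype, while "fp8_e5m2" stores it as an 8-bit float with 5 exponent and 2 mantissa bits (roughly fp16's range at much lower precision). A tiny illustration using PyTorch's fp8 dtype (assumes PyTorch >= 2.1; not used by this diff):

```python
import torch

x = torch.tensor([1.0, 3.14159, 57344.0])
# Round-trip through float8_e5m2 to see the coarse 2-bit-mantissa rounding.
print(x.to(torch.float8_e5m2).to(torch.float32))
```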
......@@ -29,7 +35,7 @@ KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
def test_copy_blocks(
......@@ -42,13 +48,14 @@ def test_copy_blocks(
num_blocks: int,
dtype: torch.dtype,
seed: int,
device: int,
kv_cache_dtype: str,
device: str,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
gpu_id = f"cuda:{device}"
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device)
# Generate random block mappings where each source block is mapped to two
# destination blocks.
assert 2 * num_mappings <= num_blocks
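A minimal sketch of the mapping construction described by the comment above (an assumption; the actual lines fall outside this hunk): sources and destinations are drawn disjointly so each source copies into two distinct blocks.

```python
import random

num_blocks, num_mappings = 1024, 256
src_blocks = random.sample(range(num_blocks), num_mappings)
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
# Each source block is copied to two destination blocks.
block_mapping = {
    src: [dst_blocks[2 * i], dst_blocks[2 * i + 1]]
    for i, src in enumerate(src_blocks)
}
```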
......@@ -66,7 +73,7 @@ def test_copy_blocks(
key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
num_layers, num_heads,
head_size, kv_cache_dtype,
dtype, seed, gpu_id)
dtype, seed, device)
# Clone the KV caches.
cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
......@@ -98,7 +105,7 @@ def test_copy_blocks(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_reshape_and_cache(
kv_cache_factory,
......@@ -109,29 +116,25 @@ def test_reshape_and_cache(
num_blocks: int,
dtype: torch.dtype,
seed: int,
device: int,
device: str,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
gpu_id = f"cuda:{device}"
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device)
# Create a random slot mapping.
num_slots = block_size * num_blocks
slot_mapping = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id)
qkv = torch.randn(num_tokens,
3,
num_heads,
head_size,
dtype=dtype,
device=gpu_id)
slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
_, key, value = qkv.unbind(dim=1)
# Create the KV caches.
key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
num_heads, head_size, dtype,
None, seed, gpu_id)
None, seed, device)
key_cache, value_cache = key_caches[0], value_caches[0]
# Clone the KV caches.
......@@ -166,7 +169,7 @@ def test_reshape_and_cache(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_swap_blocks(
kv_cache_factory,
......@@ -178,15 +181,15 @@ def test_swap_blocks(
num_blocks: int,
dtype: torch.dtype,
seed: int,
device: int,
device: str,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
src_device = f"{direction[0]}:{device}" if direction[
0] == "cuda" else direction[0]
dst_device = f"{direction[1]}:{device}" if direction[
1] == "cuda" else direction[1]
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
src_device = device if direction[0] == "cuda" else 'cpu'
dst_device = device if direction[1] == "cuda" else 'cpu'
src_blocks = random.sample(range(num_blocks), num_mappings)
# For the same device, mapping must not overlap
......@@ -200,12 +203,12 @@ def test_swap_blocks(
# Create the KV caches on the first device.
src_key_caches, src_value_caches = kv_cache_factory(
num_blocks, block_size, 1, num_heads, head_size, dtype, seed,
num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
src_device)
# Create the KV caches on the second device.
dist_key_caches, dist_value_caches = kv_cache_factory(
num_blocks, block_size, 1, num_heads, head_size, dtype, seed,
num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
dst_device)
src_key_caches_clone = src_key_caches[0].clone()
......
......@@ -8,7 +8,9 @@ NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing
ADD_RESIDUAL = [False, True]
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
......@@ -16,7 +18,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_rms_norm(
num_tokens: int,
......@@ -24,15 +26,16 @@ def test_rms_norm(
add_residual: bool,
dtype: torch.dtype,
seed: int,
device: int,
device: str,
) -> None:
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
gpu_id = f"cuda:{device}"
layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device)
layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1)
scale = 1 / (2 * hidden_size)
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id)
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
x *= scale
residual = torch.randn_like(x) * scale if add_residual else None
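For reference, the semantics the kernel is checked against here, written as a sketch of standard RMSNorm with an optional fused residual add (assumed equivalent to the layer's reference path, not copied from it):

```python
import torch

def ref_rms_norm(x: torch.Tensor,
                 weight: torch.Tensor,
                 residual: torch.Tensor = None,
                 eps: float = 1e-6) -> torch.Tensor:
    # Optionally add the residual first, then scale by the reciprocal
    # root-mean-square over the hidden dimension and the learned weight.
    if residual is not None:
        x = x + residual
    rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    return x * rms * weight
```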
......
"""Tests for the MOE layers.
Run `pytest tests/kernels/test_moe.py`.
"""
import pytest
import torch
from transformers import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.models.mixtral import MixtralMoE
def torch_moe(a, w1, w2, topk_weight, topk_ids):
def torch_moe(a, w1, w2, score, topk):
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk_ids.shape[1], 1).reshape(-1, D)
out = torch.zeros(B * topk_ids.shape[1],
w2.shape[1],
dtype=a.dtype,
device=a.device)
topk_ids = topk_ids.view(-1)
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
score = torch.softmax(score, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(score, topk)
topk_weight = topk_weight.view(-1)
topk_ids = topk_ids.view(-1)
for i in range(w1.shape[0]):
mask = topk_ids == i
if mask.sum():
out[mask] = SiluAndMul()(
a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
return (out.view(B, -1, w2.shape[1]) *
topk_weight.view(B, -1, 1)).sum(dim=1)
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
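For intuition, a tiny standalone example of the routing step used in torch_moe above (softmax over expert scores, then top-k; the numbers are illustrative only):

```python
import torch

score = torch.tensor([[2.0, 0.5, 1.0, 0.1]])  # one token, four experts
probs = torch.softmax(score, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(probs, k=2)
# topk_ids selects experts 0 and 2; torch_moe runs each selected expert on the
# token and sums the outputs weighted by topk_weight.
print(topk_ids, topk_weight)
```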
@pytest.mark.parametrize("m", [512, 222, 33, 1])
......@@ -42,9 +48,51 @@ def test_fused_moe(
w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
score = torch.randn((m, e), device='cuda', dtype=dtype)
score = torch.softmax(score, dim=-1)
topk_weight, topk_ids = torch.topk(score, topk)
triton_output = fused_moe(a, w1, w2, topk_weight, topk_ids, False)
torch_output = torch_moe(a, w1, w2, topk_weight, topk_ids)
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
torch_output = torch_moe(a, w1, w2, score, topk)
assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
@pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype):
"Make sure our Mixtral MoE implementation agrees with the one from huggingface."
# Instantiate our and huggingface's MoE blocks
config = MixtralConfig()
hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
vllm_moe = MixtralMoE(
num_experts=config.num_local_experts,
top_k=config.num_experts_per_tok,
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
params_dtype=dtype,
tp_size=1,
).cuda()
# Load the weights
vllm_moe.gate.linear_weights["weight"][:] = hf_moe.gate.weight.data
for i in range(config.num_local_experts):
weights = (hf_moe.experts[i].w1.weight.data,
hf_moe.experts[i].w3.weight.data)
vllm_moe.ws[i][:] = torch.cat(weights, dim=0)
vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
# Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(inputs)
vllm_states = vllm_moe.forward(inputs)
mixtral_moe_tol = {
torch.float32: 1e-3,
torch.float16: 1e-3,
torch.bfloat16: 1e-2,
}
assert torch.allclose(hf_states,
vllm_states,
rtol=mixtral_moe_tol[dtype],
atol=mixtral_moe_tol[dtype])
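The weight loading above concatenates each HF expert's w1 (gate) and w3 (up) projections into one tensor; a small illustration of why that works with a fused SiluAndMul activation (a sketch under that assumption, not vLLM's actual kernel):

```python
import torch

hidden, inter = 8, 16
w_gate = torch.randn(inter, hidden)         # HF expert w1
w_up = torch.randn(inter, hidden)           # HF expert w3
w_fused = torch.cat([w_gate, w_up], dim=0)  # what the test writes into vllm_moe.ws[i]

x = torch.randn(1, hidden)
gate_up = x @ w_fused.t()                   # one matmul yields both halves
gate, up = gate_up.chunk(2, dim=-1)
out = torch.nn.functional.silu(gate) * up   # SiluAndMul semantics
```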
......@@ -2,7 +2,7 @@ from typing import Optional
import pytest
import torch
from allclose_default import get_default_atol, get_default_rtol
from vllm.model_executor.layers.rotary_embedding import get_rope
IS_NEOX_STYLE = [True, False]
......@@ -13,7 +13,9 @@ NUM_HEADS = [7, 17] # Arbitrary values for testing
BATCH_SIZES = [1, 5] # Arbitrary values for testing
SEQ_LENS = [11, 8192] # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
......@@ -24,7 +26,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_rotary_embedding(
is_neox_style: bool,
......@@ -35,28 +37,26 @@ def test_rotary_embedding(
rotary_dim: Optional[int],
dtype: torch.dtype,
seed: int,
device: int,
device: str,
max_position: int = 8192,
base: int = 10000,
) -> None:
if rotary_dim is None:
rotary_dim = head_size
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)
gpu_id = f"cuda:{device}"
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
rope = rope.to(dtype=dtype, device=gpu_id)
rope = rope.to(dtype=dtype)
positions = torch.randint(0,
max_position, (batch_size, seq_len),
device=gpu_id)
positions = torch.randint(0, max_position, (batch_size, seq_len))
query = torch.randn(batch_size,
seq_len,
num_heads * head_size,
dtype=dtype,
device=gpu_id)
dtype=dtype)
key = torch.randn_like(query)
# NOTE(woosuk): The reference implementation should be executed first
......@@ -64,5 +64,11 @@ def test_rotary_embedding(
ref_query, ref_key = rope._forward(positions, query, key)
out_query, out_key = rope.forward(positions, query, key)
# Compare the results.
assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5)
assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5)
assert torch.allclose(out_query,
ref_query,
atol=get_default_atol(out_query),
rtol=get_default_rtol(out_query))
assert torch.allclose(out_key,
ref_key,
atol=get_default_atol(out_key),
rtol=get_default_rtol(out_key))
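The tolerances now come from a small allclose_default helper; a plausible sketch of that module based on the usage above (an assumption): dtype-dependent defaults, looser for half-precision types.

```python
import torch

# Hypothetical tests/kernels/allclose_default.py
default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-2, torch.float: 1e-5}
default_rtol = {torch.float16: 1e-3, torch.bfloat16: 1.6e-2, torch.float: 1.3e-6}


def get_default_atol(output: torch.Tensor) -> float:
    return default_atol[output.dtype]


def get_default_rtol(output: torch.Tensor) -> float:
    return default_rtol[output.dtype]
```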
......@@ -11,19 +11,27 @@ from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
NUM_HEADS = [12]
HEAD_SIZES = [128]
DTYPES = [torch.float16]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_contexted_kv_attention(
num_heads: int,
head_size: int,
dtype: torch.dtype,
device: str,
) -> None:
random.seed(0)
torch.manual_seed(0)
if torch.cuda.is_available():
torch.cuda.manual_seed(0)
torch.set_default_device(device)
MAX_SEQ_LEN = 1024
MAX_CTX_LEN = 1024
BS = 10
......@@ -35,24 +43,11 @@ def test_contexted_kv_attention(
seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
num_tokens = sum(subquery_lens)
query = torch.empty(num_tokens,
num_heads,
head_size,
dtype=dtype,
device='cuda')
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
query.uniform_(-1e-3, 1e-3)
output = torch.empty(num_tokens,
num_heads,
head_size,
dtype=dtype,
device='cuda')
output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
kv = torch.empty(sum(seq_lens),
2,
num_heads,
head_size,
dtype=dtype,
device='cuda')
kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, dtype=dtype)
kv.uniform_(-1e-3, 1e-3)
key, value = kv.unbind(dim=1)
......@@ -60,39 +55,27 @@ def test_contexted_kv_attention(
block_size,
num_heads,
head_size,
dtype=dtype,
device='cuda')
dtype=dtype)
v_cache = torch.zeros(cache_size,
block_size,
num_heads,
head_size,
dtype=dtype,
device='cuda')
k = torch.zeros(sum(subquery_lens),
num_heads,
head_size,
dtype=dtype,
device='cuda')
v = torch.zeros(sum(subquery_lens),
num_heads,
head_size,
dtype=dtype,
device='cuda')
values = torch.arange(0, cache_size, dtype=torch.long, device='cuda')
dtype=dtype)
k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype)
values = torch.arange(0, cache_size, dtype=torch.long)
values = values[torch.randperm(cache_size)]
block_table = values[:BS * max_block_per_request].view(
BS, max_block_per_request)
b_seq_len = torch.tensor(seq_lens, dtype=torch.long, device='cuda')
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long, device='cuda')
b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
dtype=torch.long,
device='cuda'),
dtype=torch.long),
dim=0)
max_input_len = MAX_SEQ_LEN
# copy kv to cache
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
dtype=torch.long,
device='cuda'),
dtype=torch.long),
dim=0)
for i in range(BS):
for j in range(subquery_lens[i]):
......
......@@ -121,13 +121,18 @@ def sql_lora_files():
return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
@pytest.fixture(scope="session")
def mixtral_lora_files():
return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
@pytest.fixture
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
cleanup()
get_model_old = get_model
def get_model_patched(model_config, lora_config=None):
return get_model_old(model_config,
def get_model_patched(model_config, device_config, lora_config=None):
return get_model_old(model_config, device_config,
LoRAConfig(max_loras=4, max_lora_rank=8))
with patch("vllm.worker.model_runner.get_model", get_model_patched):
......
......@@ -34,6 +34,9 @@ TOLERANCES = {
torch.float32: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2),
}
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
def get_random_id_to_index(num_loras: int,
......@@ -151,14 +154,10 @@ def create_random_inputs(
for _ in range(num_inputs):
if input_type == torch.int:
inputs.append(
torch.randint(low=int(low),
high=int(high),
size=input_size,
device="cuda"))
torch.randint(low=int(low), high=int(high), size=input_size))
else:
inputs.append(
torch.rand(size=input_size, dtype=input_type, device="cuda") *
high + low)
torch.rand(size=input_size, dtype=input_type) * high + low)
lora_id = random.choice(active_lora_ids)
index_mapping += [lora_id] * input_size[0]
......@@ -169,8 +168,10 @@ def create_random_inputs(
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
def test_embeddings(dist_init, num_loras) -> None:
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_embeddings(dist_init, num_loras, device) -> None:
torch.set_default_device(device)
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
......@@ -259,8 +260,10 @@ def test_embeddings(dist_init, num_loras) -> None:
@torch.inference_mode()
# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.")
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None:
torch.set_default_device(device)
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
......@@ -305,8 +308,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
# Add empty embeddings_tensors for unoccupied lora slots.
for _ in range(max_loras - len(embeddings_tensors)):
embeddings_tensors.append(
torch.zeros(embeddings_tensors[0].shape, device="cuda"))
embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape))
inputs, index_mapping, prompt_mapping = create_random_inputs(
active_lora_ids=list(lora_dict.keys()),
......@@ -388,8 +390,10 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras) -> None:
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
def test_lm_head_sampler(dist_init, num_loras) -> None:
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_lm_head_sampler(dist_init, num_loras, device) -> None:
torch.set_default_device(device)
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
......@@ -432,7 +436,7 @@ def test_lm_head_sampler(dist_init, num_loras) -> None:
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
input_ = torch.rand(20, 1024, device="cuda")
input_ = torch.rand(20, 1024)
mapping_info = convert_mapping(
lora_mapping,
id_to_index,
......@@ -500,8 +504,10 @@ def test_lm_head_sampler(dist_init, num_loras) -> None:
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("orientation", ["row", "column"])
def test_linear_parallel(dist_init, num_loras, orientation) -> None:
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
torch.set_default_device(device)
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
......@@ -597,8 +603,10 @@ def test_linear_parallel(dist_init, num_loras, orientation) -> None:
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("repeats", [2, 3])
def test_column_parallel_packed(dist_init, num_loras, repeats) -> None:
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
torch.set_default_device(device)
max_loras = 8
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
......
......@@ -11,25 +11,35 @@ from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA)
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.models import (EMBEDDING_MODULES, LoRAModel, LoRAModelManager,
from vllm.lora.models import (LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, LoRAMapping)
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)
from vllm.model_executor.layers.linear import RowParallelLinear
EMBEDDING_MODULES = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
}
EMBEDDING_PADDING_MODULES = ["lm_head"]
def test_from_lora_tensors(sql_lora_files):
tensors = load_file(
os.path.join(sql_lora_files, "adapter_model.safetensors"))
new_embeddings = load_file(
os.path.join(sql_lora_files, "new_embeddings.safetensors"))
lora_model = LoRAModel.from_lora_tensors(1,
8,
16,
tensors,
"cuda",
embeddings=new_embeddings)
lora_model = LoRAModel.from_lora_tensors(
1,
8,
16,
tensors,
"cuda",
embeddings=new_embeddings,
embedding_modules=EMBEDDING_MODULES,
embedding_padding_modules=EMBEDDING_PADDING_MODULES)
for module_name, lora in lora_model.loras.items():
assert lora.module_name == module_name
assert lora.rank == 8
......@@ -90,14 +100,11 @@ def create_packed_lora(
def test_replace_submodules(dist_init, dummy_model):
model = dummy_model
manager = LoRAModelManager(model,
1,
1,
1,
LoRAConfig(max_lora_rank=8,
max_cpu_loras=8,
max_loras=8),
lora_target_modules=["dense1", "layer1.dense2"])
model.supported_lora_modules = ["dense1", "layer1.dense2"]
model.packed_modules_mapping = {}
manager = LoRAModelManager(
model, 1, 1, 1,
LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8))
model = manager.model
assert isinstance(model.get_submodule("dense1"),
......@@ -111,16 +118,14 @@ def test_replace_submodules(dist_init, dummy_model):
def test_lora_model_manager(dist_init, dummy_model):
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
model.packed_modules_mapping = {}
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
manager = LoRAModelManager(
model,
2,
2,
2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2),
lora_target_modules=["dense1", "dense2", "lm_head"])
model, 2, 2, 2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
assert all(x is None for x in manager.lora_index_to_id)
assert manager.add_lora(model_lora1)
assert manager.activate_lora(1)
......@@ -159,16 +164,14 @@ def test_lora_model_manager(dist_init, dummy_model):
def test_lora_lru_cache_model_manager(dist_init, dummy_model):
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
model.packed_modules_mapping = {}
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
manager = LRUCacheLoRAModelManager(
model,
2,
2,
2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2),
lora_target_modules=["dense1", "dense2", "lm_head"])
model, 2, 2, 2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2))
assert all(x is None for x in manager.lora_index_to_id)
assert manager.add_lora(model_lora1)
assert manager.activate_lora(1)
......@@ -212,14 +215,15 @@ def test_lru_lora_model_manager(dist_init, dummy_model):
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
model.packed_modules_mapping = {}
model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"])
model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"])
model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"])
model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"])
manager = LRUCacheLoRAModelManager(
model, 2, 2, 2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2),
["dense1", "dense2", "lm_head"])
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2))
assert all(x is None for x in manager.lora_index_to_id)
......@@ -289,8 +293,9 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files):
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_lora_manager = LRUCacheWorkerLoRAManager(
4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config,
torch.device("cuda"))
4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
mapping = LoRAMapping([], [])
......@@ -362,8 +367,9 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings,
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
worker_lora_manager = WorkerLoRAManager(
4, 2, llama_2_7b_model_extra_embeddings.config.vocab_size, lora_config,
torch.device("cuda"))
4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"),
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_lora_manager.create_lora_manager(llama_2_7b_model_extra_embeddings)
mapping = LoRAMapping([], [])
......@@ -428,6 +434,13 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings,
def test_packed_loras(dist_init, dummy_model_gate_up):
model = dummy_model_gate_up
model.supported_lora_modules = ["gate_up_proj"]
model.packed_modules_mapping = {
"gate_up_proj": [
"gate_proj",
"up_proj",
],
}
model_lora = create_packed_lora(
1,
model,
......@@ -443,8 +456,7 @@ def test_packed_loras(dist_init, dummy_model_gate_up):
manager = LoRAModelManager(
model, 2, 2, 2,
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2),
["gate_up_proj"])
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2))
model = manager.model
assert isinstance(model.get_submodule("gate_up_proj"),
......
import pytest
import torch
import vllm
from vllm.lora.request import LoRARequest
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
def do_sample(llm, lora_path: str, lora_id: int):
prompts = [
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",
"[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",
]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
@pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size):
if torch.cuda.device_count() < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=tp_size,
worker_use_ray=True)
expected_lora_output = [
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",
"give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",
"inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",
]
assert do_sample(llm, mixtral_lora_files,
lora_id=1) == expected_lora_output
assert do_sample(llm, mixtral_lora_files,
lora_id=2) == expected_lora_output
......@@ -5,7 +5,8 @@ from unittest.mock import patch
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig
from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
DeviceConfig, LoRAConfig)
from vllm.worker.worker import Worker
......@@ -25,6 +26,7 @@ def test_worker_apply_lora(sql_lora_files):
),
parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32, 256),
device_config=DeviceConfig("cuda"),
local_rank=0,
rank=0,
lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
......
......@@ -9,6 +9,10 @@ from vllm.model_executor.utils import set_random_seed
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
def mock_causal_accepted_tensor(
k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor:
......@@ -39,11 +43,14 @@ def mock_causal_accepted_tensor(
@pytest.mark.parametrize(
"which_tokens_accepted",
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_correct_output_format(which_tokens_accepted: str, seed: int):
def test_correct_output_format(which_tokens_accepted: str, seed: int,
device: str):
"""Verify the output has correct format given predetermined accepted matrix.
"""
set_random_seed(seed)
torch.set_default_device(device)
batch_size = 10
k = 5
......@@ -66,18 +73,15 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int):
recovered_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
draft_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
bonus_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, 1),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
rejection_sampler = RejectionSampler()
rejection_sampler.init_gpu_tensors(rank=0)
......@@ -120,31 +124,24 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int):
@pytest.mark.parametrize("k", list(range(1, 6)))
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", list(range(1, 32)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int):
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
device: str):
torch.set_default_device(device)
rejection_sampler = RejectionSampler()
rejection_sampler.init_gpu_tensors(rank=0)
draft_probs = torch.rand(batch_size,
k,
vocab_size,
dtype=torch.float32,
device="cuda")
target_probs = torch.rand(batch_size,
k,
vocab_size,
dtype=torch.float32,
device="cuda")
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
bonus_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, 1),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
draft_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
draft_token_ids)
......@@ -153,36 +150,28 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int):
@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"])
@pytest.mark.parametrize("which_token_ids",
["bonus_token_ids", "draft_token_ids"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
which_token_ids: str):
which_token_ids: str, device: str):
k = 3
batch_size = 5
vocab_size = 30_000
torch.set_default_device(device)
rejection_sampler = RejectionSampler(strict_mode=True)
rejection_sampler.init_gpu_tensors(rank=0)
draft_probs = torch.rand(batch_size,
k,
vocab_size,
dtype=torch.float32,
device="cuda")
target_probs = torch.rand(batch_size,
k,
vocab_size,
dtype=torch.float32,
device="cuda")
draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
bonus_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, 1),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
draft_token_ids = torch.randint(low=0,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64,
device="cuda")
dtype=torch.int64)
oob_token_ids = None
if which_token_ids == "bonus_token_ids":
......@@ -237,6 +226,7 @@ def test_rejection_sampling_approximates_target_distribution(
probabilities are exactly equal. Rejection sampling should
still work without any NaNs or exceptions.
"""
torch.set_default_device("cpu")
set_random_seed(seed)
helper = _CorrectnessTestHelper(
......
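For reference, the rule this sampler is expected to implement (standard speculative-decoding rejection sampling, stated for context rather than taken from the diff): a drafted token x is accepted with probability min(1, p_target(x) / p_draft(x)); on rejection, a replacement is drawn from the normalized residual max(0, p_target - p_draft). A sketch of the acceptance probability:

```python
import torch

def acceptance_prob(target_probs: torch.Tensor, draft_probs: torch.Tensor,
                    draft_token_ids: torch.Tensor) -> torch.Tensor:
    # min(1, p_target(x) / p_draft(x)) per drafted token -- a sketch of the
    # standard rule, not vLLM's RejectionSampler kernel.
    p_t = target_probs.gather(-1, draft_token_ids.unsqueeze(-1)).squeeze(-1)
    p_d = draft_probs.gather(-1, draft_token_ids.unsqueeze(-1)).squeeze(-1)
    return torch.clamp(p_t / p_d, max=1.0)
```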
......@@ -31,24 +31,26 @@ def _prepare_test(
batch_size: int
) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
vocab_size = 32000
input_tensor = torch.rand((batch_size, 1024),
device="cuda",
dtype=torch.float16)
input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
fake_logits = torch.full((batch_size, vocab_size),
1e-2,
device=input_tensor.device,
dtype=input_tensor.dtype)
sampler = MockLogitsSampler(32000, fake_logits)
model_runner = ModelRunner(None, None, None, None)
model_runner = ModelRunner(None, None, None, None, None)
return input_tensor, fake_logits, sampler, model_runner
RANDOM_SEEDS = list(range(128))
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_greedy(seed: int):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_greedy(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
......@@ -81,8 +83,10 @@ def test_sampler_all_greedy(seed: int):
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_random(seed: int):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_random(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
......@@ -120,8 +124,10 @@ def test_sampler_all_random(seed: int):
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_all_beam(seed: int):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_beam(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
......@@ -156,8 +162,10 @@ def test_sampler_all_beam(seed: int):
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_mixed(seed: int):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_mixed(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
......@@ -212,8 +220,10 @@ def test_sampler_mixed(seed: int):
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_logits_processors(seed: int):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_logits_processors(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
......@@ -252,14 +262,15 @@ def test_sampler_logits_processors(seed: int):
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_sampler_top_k_top_p(seed: int):
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_top_k_top_p(seed: int, device: str):
set_random_seed(seed)
batch_size = random.randint(1, 256)
top_k = random.randint(100, 500)
top_p = random.random() * 0.1
vocab_size = 32000
input_tensor = torch.rand((batch_size, 1024),
device="cuda",
device=device,
dtype=torch.float16)
fake_logits = torch.normal(0,
5,
......@@ -267,7 +278,7 @@ def test_sampler_top_k_top_p(seed: int):
device=input_tensor.device,
dtype=input_tensor.dtype)
sampler = MockLogitsSampler(32000, fake_logits)
model_runner = ModelRunner(None, None, None, None)
model_runner = ModelRunner(None, None, None, None, None)
generation_model = GenerationMixin()
generation_config = GenerationConfig(top_k=top_k,
......
......@@ -4,6 +4,10 @@ It should include tests that are reported by users and making sure they
will never happen again.
"""
import gc
import torch
from vllm import LLM, SamplingParams
......@@ -35,6 +39,20 @@ def test_max_tokens_none():
assert len(prompts) == len(outputs)
def test_gc():
llm = LLM("facebook/opt-125m", enforce_eager=True)
del llm
gc.collect()
torch.cuda.empty_cache()
# The memory allocated for model and KV cache should be released.
# The memory allocated for PyTorch and others should be less than 50MB.
# Usually, it's around 10MB.
allocated = torch.cuda.memory_allocated()
assert allocated < 50 * 1024 * 1024
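If the assertion above ever fails, standard PyTorch APIs (not part of this diff) can show what is still holding GPU memory:

```python
import torch

print(f"{torch.cuda.memory_allocated() / 1024**2:.1f} MiB still allocated")
print(torch.cuda.memory_summary(abbreviated=True))
```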
if __name__ == "__main__":
import pytest
pytest.main([__file__])
......@@ -84,7 +84,7 @@ def create_worker(cls: type,
)
(model_config, cache_config, parallel_config, scheduler_config,
_) = engine_args.create_engine_configs()
device_config, _) = engine_args.create_engine_configs()
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
......@@ -93,6 +93,7 @@ def create_worker(cls: type,
model_config=model_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
......
......@@ -6,7 +6,7 @@ from vllm.worker.model_runner import ModelRunner
def test_prepare_prompt():
model_runner = ModelRunner(None, None, None, None)
model_runner = ModelRunner(None, None, None, None, None)
model_runner.set_block_size(16)
batch_size = random.randint(1, 256)
......
......@@ -9,7 +9,7 @@ from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.version import __dcu_version__
__version__ = "0.3.0"
__version__ = "0.3.1"
__all__ = [
"LLM",
......
......@@ -296,6 +296,8 @@ class AsyncLLMEngine:
async frontend will be executed in a separate process as the
model workers.
log_requests: Whether to log the requests.
max_log_len: Maximum number of prompt characters or prompt ID numbers
printed in the log.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
*args: Arguments for LLMEngine.
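A plausible sketch of how max_log_len is applied (a hypothetical helper; the actual truncation code is outside the hunks shown here): only the logged copies of the prompt and token IDs are shortened, never the request itself.

```python
from typing import List, Optional, Tuple

def shorten_for_log(
    prompt: Optional[str],
    prompt_token_ids: Optional[List[int]],
    max_log_len: Optional[int],
) -> Tuple[Optional[str], Optional[List[int]]]:
    # Truncate only what gets printed in the log line shown below.
    if max_log_len is None:
        return prompt, prompt_token_ids
    short_prompt = prompt[:max_log_len] if prompt else prompt
    short_ids = prompt_token_ids[:max_log_len] if prompt_token_ids else prompt_token_ids
    return short_prompt, short_ids
```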
......@@ -431,8 +433,8 @@ class AsyncLLMEngine:
logger.info(f"Received request {request_id}: "
f"prompt: {shortened_prompt!r}, "
f"prefix_pos: {prefix_pos},"
f"sampling params: {sampling_params}, "
f"prompt token ids: {shortened_token_ids}, "
f"sampling_params: {sampling_params}, "
f"prompt_token_ids: {shortened_token_ids}, "
f"lora_request: {lora_request}.")
if not self.is_running:
......