Commit fcffb7c8 authored by zhuwenwen

Merge branch 'vllm-v0.2.7-dtk23.10'

Parents: eb181638 4095d0db
@@ -41,7 +41,7 @@ python3 setup.py install
+ If downloading with pip install is slow, you can add a mirror: -i https://pypi.tuna.tsinghua.edu.cn/simple/
## Verification
- python -c "import vllm; print(vllm.\_\_version__)"; the version number is kept in sync with the official release and reports this package's version, e.g. 0.2.6
- python -c "import vllm; print(vllm.\_\_version__)"; the version number is kept in sync with the official release and reports this package's version, e.g. 0.2.7
## Known Issue
-
......
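For reference, the same check can also be run as a short script instead of the one-liner above. A minimal sketch, assuming the install succeeded; `__dcu_version__` is the attribute written by the `get_version_add` change in setup.py further down, read defensively in case the build does not expose it:

```python
# Minimal verification sketch: mirrors the `python -c` command above.
import vllm

print(vllm.__version__)                             # expected to print 0.2.7
print(getattr(vllm, "__dcu_version__", "not set"))  # DCU build tag, if exposed
assert vllm.__version__ == "0.2.7"
```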
@@ -27,7 +27,7 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
vLLM is fast with:
......
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -36,6 +37,7 @@ void silu_and_mul(
  dim3 grid(num_tokens);
  dim3 block(std::min(d, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
@@ -71,6 +73,7 @@ __global__ void activation_kernel(
  int64_t num_tokens = input.numel() / d; \
  dim3 grid(num_tokens); \
  dim3 block(std::min(d, 1024)); \
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
  VLLM_DISPATCH_FLOATING_TYPES( \
    input.scalar_type(), \
......
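The effect of the added `OptionalCUDAGuard` is easiest to see from the Python side. A minimal sketch, assuming a machine with at least two GPUs and that `SiluAndMul` is importable from `vllm.model_executor.layers.activation` (the same layer exercised by the kernel tests further down): with the guard in place, the custom kernel launches on the device of its input tensor even when a different device is current.

```python
import torch
from vllm.model_executor.layers.activation import SiluAndMul

# Input lives on cuda:1 while cuda:0 is still the current device.
x = torch.randn(8, 2 * 512, dtype=torch.half, device="cuda:1")
layer = SiluAndMul()
out = layer(x)  # silu_and_mul kernel now runs on cuda:1, guarded by device_of(input)
assert out.shape == (8, 512) and out.device == x.device
```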
@@ -21,6 +21,7 @@
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
@@ -616,6 +617,7 @@ void paged_attention_v1_launcher(
  dim3 grid(num_heads, num_seqs, 1);
  dim3 block(NUM_THREADS);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  switch (head_size) {
    // NOTE(woosuk): To reduce the compilation time, we only compile for the
@@ -784,6 +786,7 @@ void paged_attention_v2_launcher(
  int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
  dim3 block(NUM_THREADS);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  switch (head_size) {
    // NOTE(woosuk): To reduce the compilation time, we only compile for the
......
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -33,6 +34,7 @@ void swap_blocks(
  char *dst_ptr = static_cast<char*>(dst.data_ptr());
  const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
  const at::cuda::OptionalCUDAGuard device_guard(src_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // NOTE(woosuk): This can be slow if the number of blocks is large.
  for (const auto& pair : block_mapping) {
@@ -127,6 +129,7 @@ void copy_blocks(
  const int numel_per_block = key_caches[0][0].numel();
  dim3 grid(num_layers, num_pairs);
  dim3 block(std::min(1024, numel_per_block));
  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
@@ -207,6 +210,7 @@ void reshape_and_cache(
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key.scalar_type(),
@@ -367,6 +371,7 @@ void gather_cached_kv(
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * head_size, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key.scalar_type(),
......
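As a rough Python-level analogue of what the guarded `swap_blocks` path does per mapped block (illustrative shapes and block numbers only, not vLLM's API): select the source device, then issue asynchronous copies that ride on that device's current stream.

```python
import torch

src = torch.randn(4, 16, 128, device="cuda:0")  # stand-in for GPU cache blocks
dst = torch.empty(src.shape, dtype=src.dtype, device="cpu", pin_memory=True)
block_mapping = {0: 2, 1: 3}                    # src block -> dst block

with torch.cuda.device(src.device):             # plays the role of OptionalCUDAGuard(src_device)
    for src_block, dst_block in block_mapping.items():
        dst[dst_block].copy_(src[src_block], non_blocking=True)  # async on the current stream
torch.cuda.current_stream(src.device).synchronize()
```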
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "dispatch_utils.h"
#include "reduction_utils.cuh"
@@ -76,6 +77,7 @@ void rms_norm(
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
@@ -101,6 +103,7 @@ void fused_add_rms_norm(
  dim3 grid(num_tokens);
  dim3 block(std::min(hidden_size, 1024));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    input.scalar_type(),
......
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "dispatch_utils.h"
@@ -94,6 +95,7 @@ void rotary_embedding(
  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    query.scalar_type(),
......
@@ -287,7 +287,8 @@ void gemm_half_q_half_cuda_part
  fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count);
  kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  kernel<<<gridDim, blockDim, 0, stream>>>
  (
    a,
    b_q_weight,
@@ -434,7 +435,8 @@ void reconstruct_exllama
  gridDim.y = DIVIDE(height, BLOCK_KN_SIZE);
  gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
  reconstruct_exllama_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    b_q_weight,
    b_q_perm,
@@ -567,7 +569,8 @@ void gemm_half_q_half_alt
  gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX);
  gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE);
  gemm_half_q_half_alt_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  gemm_half_q_half_alt_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    (const half2*) a,
    b_q_weight,
@@ -639,7 +642,8 @@ void reconstruct_gptq
  blockDim.y = 1;
  gridDim.y = DIVIDE(height, 8);
  gridDim.x = DIVIDE(width, BLOCK_KN_SIZE);
  reconstruct_gptq_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  reconstruct_gptq_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    b_q_weight,
    b_gptq_scales,
@@ -794,7 +798,8 @@ void shuffle_exllama_weight
  gridDim.x = DIVIDE(width, THREADS_X);
  gridDim.y = height / 8;
  make_sequential_kernel<<<gridDim, blockDim>>>
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  make_sequential_kernel<<<gridDim, blockDim, 0, stream>>>
  (
    q_weight,
    new_qweight,
@@ -813,7 +818,8 @@ void shuffle_exllama_weight
  blockDim.y = 1;
  gridDim.x = DIVIDE(width, THREADS_X);
  gridDim.y = 1;
  shuffle_kernel<<<gridDim, blockDim>>>(q_weight, height, width);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  shuffle_kernel<<<gridDim, blockDim, 0, stream>>>(q_weight, height, width);
}
} // namespace gptq
......
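The q_gemm changes above all follow one pattern: fetch `at::cuda::getCurrentCUDAStream()` and pass it as the fourth launch argument instead of launching on the legacy default stream. A pure-torch sketch of why that matters (illustration only, no vLLM API involved): work issued while a non-default stream is current must be launched on that stream, otherwise synchronizing the stream does not cover it. The patched GPTQ/exllama kernels now get the same ordering guarantees as regular PyTorch ops issued in the block below.

```python
import torch

s = torch.cuda.Stream()
a = torch.randn(1024, 1024, device="cuda", dtype=torch.half)
with torch.cuda.stream(s):
    b = a @ a      # issued on stream s while it is the current stream
s.synchronize()    # waiting on s covers everything launched on it
print(b.float().norm().item())
```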
@@ -7,6 +7,7 @@
// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>
#include <c10/cuda/CUDAGuard.h>
#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16
@@ -200,7 +201,9 @@ void squeezellm_gemm(
  );
  dim3 threads(BLOCKWIDTH);
  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads>>>(
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
#ifndef USE_ROCM
    (half2*) vec.data<at::Half>(),
#else
......
@@ -58,11 +58,10 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
+ positions: torch.Tensor,
+ kv_caches: List[KVCache],
+ input_metadata: InputMetadata,
+ cache_events: Optional[List[torch.cuda.Event]],
+) -> SamplerOutput:
+) -> Optional[SamplerOutput]:
3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
4. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
.. note::
    Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
......
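For clarity, the signature the revised doc asks for looks like the sketch below: `cache_events` is gone and the return type is now `Optional[SamplerOutput]`. The model class name is a placeholder; `InputMetadata` and `SamplerOutput` are the vLLM types the doc refers to, left as forward references here.

```python
from typing import List, Optional, Tuple

import torch
from torch import nn

KVCache = Tuple[torch.Tensor, torch.Tensor]  # (key_cache, value_cache), as in vLLM


class MyModelForCausalLM(nn.Module):  # placeholder name
    def forward(
        self,
        input_ids: torch.Tensor,        # flattened token ids
        positions: torch.Tensor,        # flattened positions
        kv_caches: List[KVCache],
        input_metadata: "InputMetadata",
    ) -> Optional["SamplerOutput"]:
        ...
```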
@@ -47,6 +47,6 @@ if __name__ == "__main__":
    args = parser.parse_args()
    demo = build_demo()
    demo.queue(concurrency_count=100).launch(server_name=args.host,
    demo.queue().launch(server_name=args.host,
                        server_port=args.port,
                        share=True)
@@ -3,8 +3,6 @@ typing-extensions>=4.8.0
starlette
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
......
ninja # For faster builds.
psutil
ray >= 2.5.1
pandas # Required for Ray data.
pyarrow # Required for Ray data.
sentencepiece # Required for LLaMA tokenizer.
numpy
torch == 2.1.2
......
@@ -300,8 +300,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
    version += ".torch" + torch.__version__[:3]
    with open(add_version_path, encoding="utf-8",mode="w") as file:
        file.write("__version__='0.2.6'\n")
        file.write("__dcu_version__='0.2.6+{}'\n".format(version))
        file.write("__version__='0.2.7'\n")
        file.write("__dcu_version__='0.2.7+{}'\n".format(version))
        file.close()
......
@@ -8,11 +8,11 @@ import pytest
import requests
def _query_server(prompt: str) -> dict:
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
    response = requests.post("http://localhost:8000/generate",
                             json={
                                 "prompt": prompt,
                                 "max_tokens": 100,
                                 "max_tokens": max_tokens,
                                 "temperature": 0,
                                 "ignore_eos": True
                             })
@@ -20,6 +20,10 @@ def _query_server(prompt: str) -> dict:
    return response.json()
def _query_server_long(prompt: str) -> dict:
    return _query_server(prompt, max_tokens=500)
@pytest.fixture
def api_server():
    script_path = Path(__file__).parent.joinpath(
@@ -44,13 +48,14 @@ def test_api_server(api_server):
    """
    with Pool(32) as pool:
        # Wait until the server is ready
        prompts = ["Hello world"] * 1
        prompts = ["warm up"] * 1
        result = None
        while not result:
            try:
                for _ in pool.map(_query_server, prompts):
                for r in pool.map(_query_server, prompts):
                    result = r
                    break
            except Exception:
            except requests.exceptions.ConnectionError:
                time.sleep(1)
        # Actual tests start here
@@ -63,12 +68,14 @@ def test_api_server(api_server):
        assert num_aborted_requests == 0
        # Try with 100 prompts
        prompts = ["Hello world"] * 100
        prompts = ["test prompt"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
    with Pool(32) as pool:
        # Cancel requests
        pool.map_async(_query_server, prompts)
        prompts = ["canceled requests"] * 100
        pool.map_async(_query_server_long, prompts)
        time.sleep(0.01)
        pool.terminate()
        pool.join()
@@ -81,6 +88,6 @@ def test_api_server(api_server):
    # check that server still runs after cancellations
    with Pool(32) as pool:
        # Try with 100 prompts
        prompts = ["Hello world"] * 100
        prompts = ["test prompt after canceled"] * 100
        for result in pool.map(_query_server, prompts):
            assert result
@@ -8,8 +8,9 @@ from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
_TEST_PROMPTS = ["prompts/example.txt"]
_LONG_PROMPTS = ["prompts/summary.txt"]
_TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
def _read_prompts(filename: str) -> str:
@@ -24,7 +25,7 @@ def _read_prompts(filename: str) -> str:
def example_prompts() -> List[str]:
    prompts = []
    for filename in _TEST_PROMPTS:
        prompts += _read_prompts(os.path.join("tests", filename))
        prompts += _read_prompts(filename)
    return prompts
@@ -32,7 +33,7 @@ def example_prompts() -> List[str]:
def example_long_prompts() -> List[str]:
    prompts = []
    for filename in _LONG_PROMPTS:
        prompts += _read_prompts(os.path.join("tests", filename))
        prompts += _read_prompts(filename)
    return prompts
......
@@ -8,7 +8,7 @@ import pytest
import torch
from vllm.config import ParallelConfig
from vllm.engine.ray_utils import get_open_port
from vllm.utils import get_open_port
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce,
    tensor_model_parallel_all_gather,
......
@@ -12,6 +12,7 @@ def create_kv_caches(
    head_size: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
@@ -23,7 +24,7 @@ def create_kv_caches(
    for _ in range(num_layers):
        key_cache = torch.empty(size=key_cache_shape,
                                dtype=dtype,
                                device='cuda')
                                device=device)
        key_cache.uniform_(-scale, scale)
        key_caches.append(key_cache)
@@ -32,7 +33,7 @@ def create_kv_caches(
    for _ in range(num_layers):
        value_cache = torch.empty(size=value_cache_shape,
                                  dtype=dtype,
                                  device='cuda')
                                  device=device)
        value_cache.uniform_(-scale, scale)
        value_caches.append(value_cache)
    return key_caches, value_caches
......
@@ -7,22 +7,26 @@ DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_silu_and_mul(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda")
    gpu_id = f"cuda:{device}"
    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id)
    layer = SiluAndMul()
    out = layer(x)
    ref_out = layer._forward(x)
@@ -33,16 +37,19 @@ def test_silu_and_mul(
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_gelu_new(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
    gpu_id = f"cuda:{device}"
    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
    layer = NewGELU()
    out = layer(x)
    ref_out = layer._forward(x)
@@ -53,15 +60,18 @@ def test_gelu_new(
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
def test_gelu_fast(
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
    gpu_id = f"cuda:{device}"
    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
    layer = FastGELU()
    out = layer(x)
    ref_out = layer._forward(x)
......
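The test changes above all introduce the same `DEVICES` parametrization. Distilled into a minimal standalone test (a sketch assuming at least one CUDA device is available), the pattern is:

```python
import pytest
import torch

# Exercise two GPUs when available, otherwise fall back to a single device.
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]


@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_runs_on_each_device(device: int) -> None:
    gpu_id = f"cuda:{device}"
    x = torch.randn(16, 16, device=gpu_id)
    assert x.device == torch.device(gpu_id)
```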
@@ -24,6 +24,7 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
def ref_masked_attention(
@@ -87,7 +88,7 @@ def ref_single_query_cached_kv_attention(
        alibi_bias = None
        if alibi_slopes is not None:
            # Create the ALiBi bias used in the paged attention kernel.
            position_ids = torch.arange(context_len, device="cuda").int()
            position_ids = torch.arange(context_len, device=query.device).int()
            alibi_bias = (position_ids - context_len + 1).float()
            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
                1, 1, -1)
@@ -105,6 +106,7 @@ def ref_single_query_cached_kv_attention(
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
def test_paged_attention(
    kv_cache_factory,
    version: str,
@@ -115,18 +117,19 @@ def test_paged_attention(
    block_size: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    gpu_id = f"cuda:{device}"
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
    query = torch.empty(num_seqs,
                        num_query_heads,
                        head_size,
                        dtype=dtype,
                        device="cuda")
                        device=gpu_id)
    query.uniform_(-scale, scale)
    assert num_query_heads % num_kv_heads == 0
@@ -135,12 +138,12 @@ def test_paged_attention(
    if use_alibi:
        alibi_slopes = torch.randn(num_query_heads,
                                   dtype=torch.float,
                                   device="cuda")
                                   device=gpu_id)
    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
    context_lens[-1] = MAX_SEQ_LEN
    max_context_len = max(context_lens)
    context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
    context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id)
    # Create the block tables.
    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
@@ -151,12 +154,12 @@ def test_paged_attention(
            for _ in range(max_num_blocks_per_seq)
        ]
        block_tables.append(block_table)
    block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
    block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id)
    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
                                                num_kv_heads, head_size, dtype,
                                                seed)
                                                seed, gpu_id)
    key_cache, value_cache = key_caches[0], value_caches[0]
    # Call the paged attention kernel.
@@ -249,7 +252,7 @@ def ref_multi_query_kv_attention(
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
                               diagonal=1)
        attn_mask = attn_mask * torch.finfo(dtype).min
        attn_mask = attn_mask.to(dtype=dtype, device="cuda")
        attn_mask = attn_mask.to(dtype=dtype, device=query.device)
        ref_output = ref_masked_attention(
            query[start_idx:end_idx],
@@ -269,6 +272,7 @@ def ref_multi_query_kv_attention(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_multi_query_kv_attention(
    num_seqs: int,
@@ -276,11 +280,12 @@ def test_multi_query_kv_attention(
    head_size: int,
    dtype: torch.dtype,
    seed: int,
    device: int,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    gpu_id = f"cuda:{device}"
    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use
    # a smaller MAX_SEQ_LEN here.
@@ -294,7 +299,7 @@ def test_multi_query_kv_attention(
                      num_query_heads + 2 * num_kv_heads,
                      head_size,
                      dtype=dtype,
                      device="cuda")
                      device=gpu_id)
    qkv.uniform_(-scale, scale)
    query, key, value = qkv.split(
        [num_query_heads, num_kv_heads, num_kv_heads], dim=1)
......