Commit e00b0a19 authored by zhuwenwen

merge v0.3.3

parents ead94d93 3f1166ab
-from argparse import Namespace
 from dataclasses import dataclass
 import os
 import pathlib
 import pytest
-from fastapi.testclient import TestClient
-from vllm.entrypoints.openai.api_server import *
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest

chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
    __file__))).parent.parent / "examples/template_chatml.jinja"

@@ -48,7 +48,6 @@ TEST_MESSAGES = [
        'content': 'What is the capital of'
    },
]
-client = TestClient(app)

@dataclass

@@ -56,13 +55,17 @@ class MockTokenizer:
    chat_template = None

+@dataclass
+class MockServingChat:
+    tokenizer: MockTokenizer


def test_load_chat_template():
    # Testing chatml template
-    mock_args = Namespace(chat_template=chatml_jinja_path)
    tokenizer = MockTokenizer()
+    mock_serving_chat = MockServingChat(tokenizer)
-    # Call the function with the mocked args
-    load_chat_template(mock_args, tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=chatml_jinja_path)

    template_content = tokenizer.chat_template

@@ -76,11 +79,11 @@ def test_load_chat_template():
def test_no_load_chat_template():
    # Testing chatml template
    template = "../../examples/does_not_exist"
-    mock_args = Namespace(chat_template=template)
    tokenizer = MockTokenizer()
-    # Call the function with the mocked args
-    load_chat_template(mock_args, tokenizer=tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=template)
    template_content = tokenizer.chat_template

    # Test assertions

@@ -97,9 +100,9 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
                              expected_output):
    # Initialize the tokenizer
    tokenizer = get_tokenizer(tokenizer_name=model)
-    mock_args = Namespace(chat_template=template)
-    load_chat_template(mock_args, tokenizer)
+    mock_serving_chat = MockServingChat(tokenizer)
+    OpenAIServingChat._load_chat_template(mock_serving_chat,
+                                          chat_template=template)

    # Create a mock request object using keyword arguments
    mock_request = ChatCompletionRequest(

@@ -115,8 +118,3 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
    # Test assertion
    assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
-def test_health_endpoint():
-    response = client.get("/health")
-    assert response.status_code == 200

@@ -64,7 +64,7 @@ def test_request_tracker():
    stream_5 = tracker.add_request("5")
    assert tracker.new_requests_event.flag
    tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
+        RequestOutput("2", "output", [], [], [], bool(finished)))
    new, finished = tracker.get_new_and_finished_requests()
    assert not tracker.new_requests_event.flag
    assert len(finished) == 1
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`.
"""
import pytest
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@@ -13,12 +13,10 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

-def _read_prompts(filename: str) -> str:
-    prompts = []
+def _read_prompts(filename: str) -> List[str]:
    with open(filename, "r") as f:
-        prompt = f.readline()
-        prompts.append(prompt)
-    return prompts
+        prompts = f.readlines()
+        return prompts

@pytest.fixture

@@ -165,6 +163,9 @@ class VllmRunner:
        model_name: str,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
+        disable_log_stats: bool = True,
+        tensor_parallel_size: int = 1,
+        **kwargs,
    ) -> None:
        self.model = LLM(
            model=model_name,

@@ -172,6 +173,9 @@ class VllmRunner:
            trust_remote_code=True,
            dtype=dtype,
            swap_space=0,
+            disable_log_stats=disable_log_stats,
+            tensor_parallel_size=tensor_parallel_size,
+            **kwargs,
        )

    def generate(

@@ -195,6 +199,24 @@ class VllmRunner:
                outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs
def generate_w_logprobs(
self,
prompts: List[str],
sampling_params: SamplingParams,
) -> List[Tuple[List[int], str]]:
assert sampling_params.logprobs is not None
req_outputs = self.model.generate(prompts,
sampling_params=sampling_params)
outputs = []
for req_output in req_outputs:
for sample in req_output.outputs:
output_str = sample.text
output_ids = sample.token_ids
output_logprobs = sample.logprobs
outputs.append((output_ids, output_str, output_logprobs))
return outputs
    def generate_greedy(
        self,
        prompts: List[str],

@@ -205,6 +227,20 @@ class VllmRunner:
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]
def generate_greedy_logprobs(
self,
prompts: List[str],
max_tokens: int,
num_logprobs: int,
) -> List[Tuple[List[int], str]]:
greedy_logprobs_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens,
logprobs=num_logprobs)
outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
    def generate_beam_search(
        self,
        prompts: List[str],
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
"""
import pytest
import torch
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
@@ -6,24 +6,13 @@ import pytest
import torch
import ray

-from vllm.config import ParallelConfig
-from vllm.utils import get_open_port
from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce,
    tensor_model_parallel_all_gather,
+    broadcast_tensor_dict,
)
-from vllm.worker.worker import _init_distributed_environment
+from vllm.test_utils import (init_test_distributed_environment,
+                             multi_process_tensor_parallel)

-def init_test_distributed_environment(pipeline_parallel_size: int,
-                                      tensor_parallel_size: int, rank: int,
-                                      distributed_init_port: str):
-    parallel_config = ParallelConfig(pipeline_parallel_size,
-                                     tensor_parallel_size,
-                                     worker_use_ray=True)
-    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    _init_distributed_environment(parallel_config, rank,
-                                  distributed_init_method)

@ray.remote(num_gpus=1, max_calls=1)

@@ -64,22 +53,40 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
    assert torch.allclose(t, expected)
@ray.remote(num_gpus=1, max_calls=1)
def broadcast_tensor_dict_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
distributed_init_port)
test_dict = {
"a": torch.arange(8, dtype=torch.float32, device="cuda"),
"b": torch.arange(16, dtype=torch.int8, device="cuda"),
"c": "test",
"d": [1, 2, 3],
"e": {
"a": 1,
"b": 2
},
}
if rank == 0:
broadcast_tensor_dict(test_dict, src=0)
else:
recv_dict = broadcast_tensor_dict(src=0)
assert len(recv_dict) == len(test_dict)
assert torch.allclose(recv_dict["a"], test_dict["a"])
assert torch.allclose(recv_dict["b"], test_dict["b"])
assert recv_dict["c"] == test_dict["c"]
assert recv_dict["d"] == test_dict["d"]
assert recv_dict["e"] == test_dict["e"]
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
-@pytest.mark.parametrize("test_target",
-                         [all_reduce_test_worker, all_gather_test_worker])
+@pytest.mark.parametrize("test_target", [
+    all_reduce_test_worker, all_gather_test_worker,
+    broadcast_tensor_dict_test_worker
+])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    # Using ray helps debugging the error when it failed
-    # as compared to multiprocessing.
-    ray.init()
-    distributed_init_port = get_open_port()
-    refs = []
-    for rank in range(tensor_parallel_size):
-        refs.append(
-            test_target.remote(tensor_parallel_size, rank,
-                               distributed_init_port))
-    ray.get(refs)
-    ray.shutdown()
+    multi_process_tensor_parallel(tensor_parallel_size, test_target)
import random
import os
import pytest
import ray
import torch
import torch.distributed as dist
from vllm.model_executor.parallel_utils import custom_all_reduce as custom_ar
from vllm.model_executor.parallel_utils.communication_op import (
tensor_model_parallel_all_reduce)
from vllm.test_utils import (init_test_distributed_environment,
multi_process_tensor_parallel)
random.seed(42)
test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
for i, v in enumerate(test_sizes):
test_sizes[i] -= v % 8
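# graph_allreduce: capture the custom all-reduce and a reference NCCL
# all-reduce inside the same CUDA graph, replay it, and check that both
# produce identical results.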
@ray.remote(num_gpus=1, max_calls=1)
def graph_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
distributed_init_port)
custom_ar.init_custom_ar()
for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with custom_ar.capture():
# use integers so result matches NCCL exactly
inp1 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
inp2 = torch.randint(1,
16, (sz, ),
dtype=dtype,
device=torch.cuda.current_device())
torch.cuda.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
out1 = tensor_model_parallel_all_reduce(inp1)
# the input buffer is immediately modified to test
# synchronization
dist.all_reduce(inp1)
out2 = tensor_model_parallel_all_reduce(inp2)
dist.all_reduce(inp2)
graph.replay()
assert torch.allclose(out1, inp1)
assert torch.allclose(out2, inp2)
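# eager_allreduce: call the custom all-reduce handle directly (no CUDA graph)
# and check the result against the expected sum across workers.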
@ray.remote(num_gpus=1, max_calls=1)
def eager_allreduce(world_size, rank, distributed_init_port):
del os.environ["CUDA_VISIBLE_DEVICES"]
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(1, world_size, rank,
distributed_init_port)
sz = 1024
custom_ar.init_custom_ar()
fa = custom_ar.get_handle()
inp = torch.ones(sz, dtype=torch.float32, device=device)
out = fa.all_reduce_unreg(inp)
assert torch.allclose(out, inp * world_size)
inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
out = fa.all_reduce_unreg(inp)
assert torch.allclose(out, inp * world_size)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
multi_process_tensor_parallel(tensor_parallel_size, test_target)
if __name__ == "__main__":
multi_process_tensor_parallel(2, graph_allreduce)
# This unit test should be moved to a new
# tests/test_guided_decoding directory.
from transformers import AutoTokenizer
import torch
from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor,
JSONLogitsProcessor)
TEST_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "string"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work history"]
}
TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
def test_guided_logits_processors():
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer)
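    # Both processors are expected to modify the logits tensor in place,
    # masking out tokens that would violate the regex / schema constraint.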
regex_LP.init_state()
token_ids = tokenizer.encode(
f"Give an example IPv4 address with this regex: {TEST_REGEX}")
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
regex_LP(token_ids, tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
json_LP.init_state()
token_ids = tokenizer.encode(
f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
json_LP(token_ids, tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
import os
import subprocess
import time
import sys
import pytest
import requests
import ray # using Ray for overall ease of process management, parallel requests, and debugging.
import openai # use the official client for correctness check
from huggingface_hub import snapshot_download # downloading lora to test lora requests
# imports for guided decoding tests
import json
import jsonschema
import re
from vllm.transformers_utils.tokenizer import get_tokenizer
MAX_SERVER_START_WAIT_S = 600  # wait up to 600 seconds for the server to start
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here
LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here
TEST_SCHEMA = {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"age": {
"type": "integer"
},
"skills": {
"type": "array",
"items": {
"type": "string",
"maxLength": 10
},
"minItems": 3
},
"work history": {
"type": "array",
"items": {
"type": "object",
"properties": {
"company": {
"type": "string"
},
"duration": {
"type": "string"
},
"position": {
"type": "string"
}
},
"required": ["company", "position"]
}
}
},
"required": ["name", "age", "skills", "work history"]
}
TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \
r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
TEST_CHOICE = [
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
"Swift", "Kotlin"
]
pytestmark = pytest.mark.asyncio
@ray.remote(num_gpus=1)
class ServerRunner:
def __init__(self, args):
env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
self.proc = subprocess.Popen(
["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
env=env,
stdout=sys.stdout,
stderr=sys.stderr,
)
self._wait_for_server()
def ready(self):
return True
def _wait_for_server(self):
# run health check
start = time.time()
while True:
try:
if requests.get(
"http://localhost:8000/health").status_code == 200:
break
except Exception as err:
if self.proc.poll() is not None:
raise RuntimeError("Server exited unexpectedly.") from err
time.sleep(0.5)
if time.time() - start > MAX_SERVER_START_WAIT_S:
raise RuntimeError(
"Server failed to start in time.") from err
def __del__(self):
if hasattr(self, "proc"):
self.proc.terminate()
@pytest.fixture(scope="session")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
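# Start the OpenAI-compatible API server once per test session (in a
# subprocess managed through Ray) and shut it down when the session ends.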
@pytest.fixture(scope="session")
def server(zephyr_lora_files):
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16", # use half precision for speed and memory savings in CI environment
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128"
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()
@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client
async def test_check_models(server, client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
lora_models = models[1:]
assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models)
assert lora_models[0].id == "zephyr-lora"
assert lora_models[1].id == "zephyr-lora2"
@pytest.mark.parametrize(
# first test base model, then test loras
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 1
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
assert completion.choices[0].finish_reason == "length"
assert completion.usage == openai.types.CompletionUsage(
completion_tokens=5, prompt_tokens=6, total_tokens=11)
# test using token IDs
completion = await client.completions.create(
model=MODEL_NAME,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=10)
assert chat_completion.id is not None
assert chat_completion.choices is not None and len(
chat_completion.choices) == 1
assert chat_completion.choices[0].message is not None
assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.top_logprobs is not None
assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 10
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(server, client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is an LLM?"
single_completion = await client.completions.create(
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
)
single_output = single_completion.choices[0].text
single_usage = single_completion.usage
stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
stream=True)
chunks = []
async for chunk in stream:
chunks.append(chunk.choices[0].text)
assert chunk.choices[0].finish_reason == "length"
assert chunk.usage == single_usage
assert "".join(chunks) == single_output
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "what is 1+1?"
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
)
chunks = []
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
assert chunk.choices[0].finish_reason == stop_reason
assert "".join(chunks) == output
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
model_name: str):
# test simple list
batch = await client.completions.create(
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
)
assert len(batch.choices) == 2
assert batch.choices[0].text == batch.choices[1].text
# test n = 2
batch = await client.completions.create(
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
n=2,
max_tokens=5,
temperature=0.0,
extra_body=dict(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client.
use_beam_search=True),
)
assert len(batch.choices) == 4
assert batch.choices[0].text != batch.choices[
1].text, "beam search should be different"
assert batch.choices[0].text == batch.choices[
2].text, "two copies of the same prompt should be the same"
assert batch.choices[1].text == batch.choices[
3].text, "two copies of the same prompt should be the same"
# test streaming
batch = await client.completions.create(
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
stream=True,
)
texts = [""] * 2
async for chunk in batch:
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
assert texts[0] == texts[1]
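# A large positive logit_bias should force the biased token to be sampled;
# a large negative bias on previously generated tokens should ban them.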
async def test_logits_bias(server, client: openai.AsyncOpenAI):
prompt = "Hello, my name is"
max_tokens = 5
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
# Test exclusive selection
token_id = 1000
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
logit_bias={str(token_id): 100},
seed=42,
)
assert completion.choices[0].text is not None and len(
completion.choices[0].text) >= 5
response_tokens = tokenizer(completion.choices[0].text,
add_special_tokens=False)["input_ids"]
expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
add_special_tokens=False)["input_ids"]
assert all([
response == expected
for response, expected in zip(response_tokens, expected_tokens)
])
# Test ban
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
)
response_tokens = tokenizer(completion.choices[0].text,
add_special_tokens=False)["input_ids"]
first_response = completion.choices[0].text
completion = await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
logit_bias={str(token): -100
for token in response_tokens},
)
assert first_response != completion.choices[0].text
async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=
f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}",
n=3,
temperature=1.0,
max_tokens=500,
extra_body=dict(guided_json=TEST_SCHEMA))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3
for i in range(3):
assert completion.choices[i].text is not None
output_json = json.loads(completion.choices[i].text)
jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role": "user",
"content": "Give an example JSON for an employee profile that " + \
f"fits this schema: {TEST_SCHEMA}"
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=500,
extra_body=dict(guided_json=TEST_SCHEMA))
message = chat_completion.choices[0].message
assert message.content is not None
json1 = json.loads(message.content)
jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
messages.append({"role": "assistant", "content": message.content})
messages.append({
"role":
"user",
"content":
"Give me another one with a different name and age"
})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=500,
extra_body=dict(guided_json=TEST_SCHEMA))
message = chat_completion.choices[0].message
assert message.content is not None
json2 = json.loads(message.content)
jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
async def test_guided_regex_completion(server, client: openai.AsyncOpenAI):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
n=3,
temperature=1.0,
max_tokens=20,
extra_body=dict(guided_regex=TEST_REGEX))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3
for i in range(3):
assert completion.choices[i].text is not None
assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
f"Give an example IP address with this regex: {TEST_REGEX}"
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=20,
extra_body=dict(guided_regex=TEST_REGEX))
ip1 = chat_completion.choices[0].message.content
assert ip1 is not None
assert re.fullmatch(TEST_REGEX, ip1) is not None
messages.append({"role": "assistant", "content": ip1})
messages.append({"role": "user", "content": "Give me a different one"})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=20,
extra_body=dict(guided_regex=TEST_REGEX))
ip2 = chat_completion.choices[0].message.content
assert ip2 is not None
assert re.fullmatch(TEST_REGEX, ip2) is not None
assert ip1 != ip2
async def test_guided_choice_completion(server, client: openai.AsyncOpenAI):
completion = await client.completions.create(
model=MODEL_NAME,
prompt="The best language for type-safe systems programming is ",
n=2,
temperature=1.0,
max_tokens=10,
extra_body=dict(guided_choice=TEST_CHOICE))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 2
for i in range(2):
assert completion.choices[i].text in TEST_CHOICE
async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
extra_body=dict(guided_choice=TEST_CHOICE))
choice1 = chat_completion.choices[0].message.content
assert choice1 in TEST_CHOICE
messages.append({"role": "assistant", "content": choice1})
messages.append({
"role": "user",
"content": "I disagree, pick another one"
})
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=10,
extra_body=dict(guided_choice=TEST_CHOICE))
choice2 = chat_completion.choices[0].message.content
assert choice2 in TEST_CHOICE
assert choice1 != choice2
async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI):
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example JSON that fits this schema: 42",
extra_body=dict(guided_json=42))
messages = [{
"role": "system",
"content": "you are a helpful assistant"
}, {
"role":
"user",
"content":
"The best language for type-safe systems programming is "
}]
with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
extra_body=dict(guided_regex={
1: "Python",
2: "C++"
}))
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example string that fits this regex",
extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
if __name__ == "__main__":
pytest.main([__file__])
import torch
# Reference default values of atol and rtol are from
# https://github.com/pytorch/pytorch/blob/6d96beb6bec24d73ee3f080bac54d2104068f675/test/test_transformers.py#L67
default_atol = {torch.float16: 1e-3, torch.bfloat16: 1e-3, torch.float: 1e-5}
default_rtol = {
torch.float16: 1e-3,
torch.bfloat16: 1.6e-2,
torch.float: 1.3e-6
}
def get_default_atol(output) -> float:
return default_atol[output.dtype]
def get_default_rtol(output) -> float:
return default_rtol[output.dtype]
-from typing import List, Tuple
import pytest
-import torch
+from vllm.utils import create_kv_caches_with_random

-def create_kv_caches(
-    num_blocks: int,
-    block_size: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    scale = head_size**-0.5
-    x = 16 // torch.tensor([], dtype=dtype).element_size()
-    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
-    key_caches = []
-    for _ in range(num_layers):
-        key_cache = torch.empty(size=key_cache_shape,
-                                dtype=dtype,
-                                device=device)
-        key_cache.uniform_(-scale, scale)
-        key_caches.append(key_cache)
-    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_caches = []
-    for _ in range(num_layers):
-        value_cache = torch.empty(size=value_cache_shape,
-                                  dtype=dtype,
-                                  device=device)
-        value_cache.uniform_(-scale, scale)
-        value_caches.append(value_cache)
-    return key_caches, value_caches


@pytest.fixture()
def kv_cache_factory():
-    return create_kv_caches
+    return create_kv_caches_with_random
+from typing import Type

import pytest
import torch

-from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul
+from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
+                                                   NewGELU, SiluAndMul)
+from allclose_default import get_default_atol, get_default_rtol

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


+@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
-def test_silu_and_mul(
+def test_act_and_mul(
+    activation: Type[torch.nn.Module],
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
-    device: int,
+    device: str,
) -> None:
    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device=gpu_id)
-    layer = SiluAndMul()
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+    layer = activation()
    out = layer(x)
    ref_out = layer._forward(x)
-    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
+    # The SiLU and GELU implementations are equivalent to the native PyTorch
+    # implementations, so we can do exact comparison.
+    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)


+@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
-def test_gelu_new(
-    num_tokens: int,
-    d: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: int,
-) -> None:
-    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
-    layer = NewGELU()
-    out = layer(x)
-    ref_out = layer._forward(x)
-    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
-@pytest.mark.parametrize("d", D)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
-def test_gelu_fast(
+def test_activation(
+    activation: Type[torch.nn.Module],
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
    seed: int,
-    device: int,
+    device: str,
) -> None:
    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
-    x = torch.randn(num_tokens, d, dtype=dtype, device=gpu_id)
-    layer = FastGELU()
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    x = torch.randn(num_tokens, d, dtype=dtype)
+    layer = activation()
    out = layer(x)
    ref_out = layer._forward(x)
-    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
+    assert torch.allclose(out,
+                          ref_out,
+                          atol=get_default_atol(out),
+                          rtol=get_default_rtol(out))
@@ -6,25 +6,38 @@ import torch
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

-from vllm._C import ops
+from vllm._C import ops, cache_ops
from vllm.utils import get_max_shared_memory_bytes
+from vllm.utils import is_hip
+from allclose_default import get_default_atol, get_default_rtol

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-NUM_BLOCKS = 12000  # Arbitrary values for testing
+# There may not be enough gpu memory due to large NUM_BLOCKS.
+# Reduce NUM_BLOCKS when it happens.
+NUM_BLOCKS = 4321  # Arbitrary values for testing
PARTITION_SIZE = 512
-DTYPES = [torch.half, torch.bfloat16, torch.float]
+# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
+DTYPES = [torch.half, torch.bfloat16, torch.float
+          ] if not is_hip() else [torch.half, torch.bfloat16]
NUM_GEN_SEQS = [7]  # Arbitrary values for testing
NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
-HEAD_SIZES = [64, 80, 96, 112, 128, 256]
+# FlashAttention forward only supports head dimension at most 128
+# https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
+HEAD_SIZES = [64, 80, 96, 112, 128, 256
+              ] if not is_hip() else [64, 80, 96, 112, 128]
BLOCK_SIZES = [16, 32]
USE_ALIBI = [False, True]
+KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
SEEDS = [0]
-DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]


def ref_masked_attention(

@@ -88,7 +101,7 @@ def ref_single_query_cached_kv_attention(
        alibi_bias = None
        if alibi_slopes is not None:
            # Create the ALiBi bias used in the paged attention kernel.
-            position_ids = torch.arange(context_len, device=query.device).int()
+            position_ids = torch.arange(context_len).int()
            alibi_bias = (position_ids - context_len + 1).float()
            alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view(
                1, 1, -1)

@@ -105,8 +118,9 @@ def ref_single_query_cached_kv_attention(
@pytest.mark.parametrize("use_alibi", USE_ALIBI)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_paged_attention(
    kv_cache_factory,
    version: str,

@@ -116,34 +130,30 @@ def test_paged_attention(
    use_alibi: bool,
    block_size: int,
    dtype: torch.dtype,
+    kv_cache_dtype: str,
    seed: int,
-    device: int,
+    device: str,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
-    query = torch.empty(num_seqs,
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype,
-                        device=gpu_id)
+    query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype)
    query.uniform_(-scale, scale)

    assert num_query_heads % num_kv_heads == 0
    num_queries_per_kv = num_query_heads // num_kv_heads
    alibi_slopes = None
    if use_alibi:
-        alibi_slopes = torch.randn(num_query_heads,
-                                   dtype=torch.float,
-                                   device=gpu_id)
+        alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)

    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
    context_lens[-1] = MAX_SEQ_LEN
    max_context_len = max(context_lens)
-    context_lens = torch.tensor(context_lens, dtype=torch.int, device=gpu_id)
+    context_lens = torch.tensor(context_lens, dtype=torch.int)

    # Create the block tables.
    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size

@@ -154,12 +164,13 @@ def test_paged_attention(
            for _ in range(max_num_blocks_per_seq)
        ]
        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device=gpu_id)
+    block_tables = torch.tensor(block_tables, dtype=torch.int)

    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1,
-                                                num_kv_heads, head_size, dtype,
-                                                seed, gpu_id)
+                                                num_kv_heads, head_size,
+                                                kv_cache_dtype, dtype, seed,
+                                                device)
    key_cache, value_cache = key_caches[0], value_caches[0]

    # Call the paged attention kernel.

@@ -177,6 +188,7 @@ def test_paged_attention(
            block_size,
            max_context_len,
            alibi_slopes,
+            kv_cache_dtype,
        )
    elif version == "v2":
        num_partitions = ((max_context_len + PARTITION_SIZE - 1) //

@@ -186,12 +198,10 @@ def test_paged_attention(
        tmp_output = torch.empty(
            size=(num_seqs, num_heads, num_partitions, head_size),
            dtype=output.dtype,
-            device=output.device,
        )
        exp_sums = torch.empty(
            size=(num_seqs, num_heads, num_partitions),
            dtype=torch.float32,
-            device=output.device,
        )
        max_logits = torch.empty_like(exp_sums)
        ops.paged_attention_v2(

@@ -209,11 +219,30 @@ def test_paged_attention(
            block_size,
            max_context_len,
            alibi_slopes,
+            kv_cache_dtype,
        )
    else:
        raise AssertionError(f"Unknown version: {version}")

    # Run the reference implementation.
if kv_cache_dtype == "fp8_e5m2":
# Convert cache data back to dtype.
x = 16 // torch.tensor([], dtype=dtype).element_size()
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x,
block_size, x)
dequantized_key_cache = torch.empty(size=key_cache_shape,
dtype=dtype,
device=device)
cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache)
key_cache = dequantized_key_cache
value_cache_shape = value_cache.shape
dequantized_value_cache = torch.empty(size=value_cache_shape,
dtype=dtype,
device=device)
cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache)
value_cache = dequantized_value_cache
    ref_output = torch.empty_like(query)
    ref_single_query_cached_kv_attention(
        ref_output,

@@ -230,7 +259,14 @@ def test_paged_attention(
    # NOTE(woosuk): Due to the kernel-level differences in the two
    # implementations, there is a small numerical difference in the two
    # outputs. Thus, we use a relaxed tolerance for the test.
-    assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+    # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error,
+    # so we use a relaxed tolerance for the test.
+    if kv_cache_dtype == "fp8_e5m2":
+        atol, rtol = 1e-2, 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)


def ref_multi_query_kv_attention(

@@ -252,7 +288,7 @@ def ref_multi_query_kv_attention(
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
                               diagonal=1)
        attn_mask = attn_mask * torch.finfo(dtype).min
-        attn_mask = attn_mask.to(dtype=dtype, device=query.device)
+        attn_mask = attn_mask.to(dtype=dtype)

        ref_output = ref_masked_attention(
            query[start_idx:end_idx],

@@ -272,7 +308,7 @@ def ref_multi_query_kv_attention(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_multi_query_kv_attention(
    num_seqs: int,

@@ -280,12 +316,13 @@ def test_multi_query_kv_attention(
    head_size: int,
    dtype: torch.dtype,
    seed: int,
-    device: int,
+    device: str,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    gpu_id = f"cuda:{device}"
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use
    # a smaller MAX_SEQ_LEN here.

@@ -298,8 +335,7 @@ def test_multi_query_kv_attention(
    qkv = torch.empty(num_tokens,
                      num_query_heads + 2 * num_kv_heads,
                      head_size,
-                      dtype=dtype,
-                      device=gpu_id)
+                      dtype=dtype)
    qkv.uniform_(-scale, scale)
    query, key, value = qkv.split(
        [num_query_heads, num_kv_heads, num_kv_heads], dim=1)

@@ -331,4 +367,6 @@ def test_multi_query_kv_attention(
        scale,
        dtype,
    )
-    assert torch.allclose(output, ref_output, atol=1e-3, rtol=1e-5)
+    atol = get_default_atol(output) if is_hip() else 1e-3
+    rtol = get_default_rtol(output) if is_hip() else 1e-5
+    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
@@ -3,18 +3,28 @@ import random
import pytest
import torch
+from typing import Tuple

from vllm._C import cache_ops
+from vllm.utils import is_hip

+COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [42]  # Arbitrary values for testing
NUM_LAYERS = [1]  # Arbitrary values for testing
NUM_HEADS = [8]  # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
+# Reduce the size for the ROCm test to avoid HIP OOM.
+NUM_BLOCKS = [1024, 36000] if not is_hip() else [
+    1024, 10000
+]  # Arbitrary values for testing
NUM_MAPPINGS = [256] # Arbitrary values for testing NUM_MAPPINGS = [256] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
KV_CACHE_DTYPE = ["auto", "fp8_e5m2"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
...@@ -25,7 +35,8 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] ...@@ -25,7 +35,8 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode() @torch.inference_mode()
def test_copy_blocks( def test_copy_blocks(
kv_cache_factory, kv_cache_factory,
...@@ -37,12 +48,14 @@ def test_copy_blocks( ...@@ -37,12 +48,14 @@ def test_copy_blocks(
num_blocks: int, num_blocks: int,
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: int, kv_cache_dtype: str,
device: str,
) -> None: ) -> None:
random.seed(seed) random.seed(seed)
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) if torch.cuda.is_available():
gpu_id = f"cuda:{device}" torch.cuda.manual_seed(seed)
torch.set_default_device(device)
# Generate random block mappings where each source block is mapped to two # Generate random block mappings where each source block is mapped to two
# destination blocks. # destination blocks.
assert 2 * num_mappings <= num_blocks assert 2 * num_mappings <= num_blocks
...@@ -59,7 +72,8 @@ def test_copy_blocks( ...@@ -59,7 +72,8 @@ def test_copy_blocks(
# Create the KV caches. # Create the KV caches.
key_caches, value_caches = kv_cache_factory(num_blocks, block_size, key_caches, value_caches = kv_cache_factory(num_blocks, block_size,
num_layers, num_heads, num_layers, num_heads,
head_size, dtype, seed, gpu_id) head_size, kv_cache_dtype,
dtype, seed, device)
# Clone the KV caches. # Clone the KV caches.
cloned_key_caches = [key_cache.clone() for key_cache in key_caches] cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
...@@ -91,7 +105,7 @@ def test_copy_blocks( ...@@ -91,7 +105,7 @@ def test_copy_blocks(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_reshape_and_cache( def test_reshape_and_cache(
kv_cache_factory, kv_cache_factory,
...@@ -102,29 +116,25 @@ def test_reshape_and_cache( ...@@ -102,29 +116,25 @@ def test_reshape_and_cache(
num_blocks: int, num_blocks: int,
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: int, device: str,
) -> None: ) -> None:
random.seed(seed) random.seed(seed)
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) if torch.cuda.is_available():
gpu_id = f"cuda:{device}" torch.cuda.manual_seed(seed)
torch.set_default_device(device)
# Create a random slot mapping. # Create a random slot mapping.
num_slots = block_size * num_blocks num_slots = block_size * num_blocks
slot_mapping = random.sample(range(num_slots), num_tokens) slot_mapping = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=gpu_id) slot_mapping = torch.tensor(slot_mapping, dtype=torch.long)
qkv = torch.randn(num_tokens, qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype)
3,
num_heads,
head_size,
dtype=dtype,
device=gpu_id)
_, key, value = qkv.unbind(dim=1) _, key, value = qkv.unbind(dim=1)
# Create the KV caches. # Create the KV caches.
key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1,
num_heads, head_size, dtype, num_heads, head_size, dtype,
seed, gpu_id) None, seed, device)
key_cache, value_cache = key_caches[0], value_caches[0] key_cache, value_cache = key_caches[0], value_caches[0]
# Clone the KV caches. # Clone the KV caches.
...@@ -133,7 +143,7 @@ def test_reshape_and_cache( ...@@ -133,7 +143,7 @@ def test_reshape_and_cache(
# Call the reshape_and_cache kernel. # Call the reshape_and_cache kernel.
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
slot_mapping) slot_mapping, "auto")
# Run the reference implementation. # Run the reference implementation.
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
...@@ -149,3 +159,68 @@ def test_reshape_and_cache( ...@@ -149,3 +159,68 @@ def test_reshape_and_cache(
assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(key_cache, cloned_key_cache)
assert torch.allclose(value_cache, cloned_value_cache) assert torch.allclose(value_cache, cloned_value_cache)
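# A hedged sketch of the elided reference update checked above (an assumption,
# not a verbatim copy of the hidden lines): each token's key/value is scattered
# into the paged caches by its slot index before the allclose comparisons.
#     for i, slot in enumerate(slot_mapping.tolist()):
#         block_idx, block_offset = slot // block_size, slot % block_size
#         cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
#         cloned_value_cache[block_idx, :, :, block_offset] = value[i]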
@pytest.mark.parametrize("direction", COPYING_DIRECTION)
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_swap_blocks(
kv_cache_factory,
direction: Tuple[str, str],
num_mappings: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
seed: int,
device: str,
) -> None:
random.seed(seed)
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
src_device = device if direction[0] == "cuda" else 'cpu'
dst_device = device if direction[1] == "cuda" else 'cpu'
src_blocks = random.sample(range(num_blocks), num_mappings)
# For the same device, mapping must not overlap
if src_device == dst_device:
remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
dst_blocks = random.sample(remaining_blocks, num_mappings)
else:
dst_blocks = random.sample(range(num_blocks), num_mappings)
block_mapping = dict(zip(src_blocks, dst_blocks))
# Create the KV caches on the first device.
src_key_caches, src_value_caches = kv_cache_factory(
num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
src_device)
# Create the KV caches on the second device.
dist_key_caches, dist_value_caches = kv_cache_factory(
num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed,
dst_device)
src_key_caches_clone = src_key_caches[0].clone()
src_value_caches_clone = src_value_caches[0].clone()
# Call the swap_blocks kernel.
cache_ops.swap_blocks(src_key_caches[0], dist_key_caches[0], block_mapping)
cache_ops.swap_blocks(src_value_caches[0], dist_value_caches[0],
block_mapping)
for src, dst in block_mapping.items():
assert torch.allclose(src_key_caches_clone[src].cpu(),
dist_key_caches[0][dst].cpu())
assert torch.allclose(src_value_caches_clone[src].cpu(),
dist_value_caches[0][dst].cpu())
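# For reference, a minimal pure-PyTorch sketch of the semantics checked above.
# This is an illustration under the test's assumptions, not vLLM's
# cache_ops.swap_blocks: each mapped source block is copied into the
# corresponding destination block, possibly across devices.
def _swap_blocks_reference(src_cache: torch.Tensor, dst_cache: torch.Tensor,
                           block_mapping: dict) -> None:
    for src, dst in block_mapping.items():
        # copy_ handles the cross-device transfer (e.g. CPU <-> CUDA).
        dst_cache[dst].copy_(src_cache[src])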
...@@ -8,7 +8,9 @@ NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing ...@@ -8,7 +8,9 @@ NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing
HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing HIDDEN_SIZES = [768, 5120, 8192] # Arbitrary values for testing
ADD_RESIDUAL = [False, True] ADD_RESIDUAL = [False, True]
SEEDS = [0] SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
...@@ -16,7 +18,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] ...@@ -16,7 +18,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_rms_norm( def test_rms_norm(
num_tokens: int, num_tokens: int,
...@@ -24,15 +26,16 @@ def test_rms_norm( ...@@ -24,15 +26,16 @@ def test_rms_norm(
add_residual: bool, add_residual: bool,
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: int, device: str,
) -> None: ) -> None:
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) if torch.cuda.is_available():
gpu_id = f"cuda:{device}" torch.cuda.manual_seed(seed)
layer = RMSNorm(hidden_size).to(dtype=dtype, device=gpu_id) torch.set_default_device(device)
layer = RMSNorm(hidden_size).to(dtype=dtype)
layer.weight.data.normal_(mean=1.0, std=0.1) layer.weight.data.normal_(mean=1.0, std=0.1)
scale = 1 / (2 * hidden_size) scale = 1 / (2 * hidden_size)
x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=gpu_id) x = torch.randn(num_tokens, hidden_size, dtype=dtype)
x *= scale x *= scale
residual = torch.randn_like(x) * scale if add_residual else None residual = torch.randn_like(x) * scale if add_residual else None
......
"""Tests for the MOE layers.
Run `pytest tests/kernels/test_moe.py`.
"""
import pytest
import torch
from transformers import MixtralConfig
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
from vllm.model_executor.layers.fused_moe import fused_moe
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.models.mixtral import MixtralMoE
def torch_moe(a, w1, w2, score, topk):
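    # Reference MoE in plain PyTorch. Expected shapes (from test_fused_moe):
    #   a:     [B, D]      input tokens
    #   w1:    [E, 2N, D]  per-expert gate/up projection (split by SiluAndMul)
    #   w2:    [E, D, N]   per-expert down projection
    #   score: [B, E]      router logits; the top-k experts are picked per token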
B, D = a.shape
a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
score = torch.softmax(score, dim=-1, dtype=torch.float32)
topk_weight, topk_ids = torch.topk(score, topk)
topk_weight = topk_weight.view(-1)
topk_ids = topk_ids.view(-1)
for i in range(w1.shape[0]):
mask = topk_ids == i
if mask.sum():
out[mask] = SiluAndMul()(
a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
return (out.view(B, -1, w2.shape[1]) *
topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
@pytest.mark.parametrize("m", [512, 222, 33, 1])
@pytest.mark.parametrize("n", [2048, 256, 1024])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", [8, 64])
@pytest.mark.parametrize("topk", [2, 6])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_fused_moe(
m: int,
n: int,
k: int,
e: int,
topk: int,
dtype: torch.dtype,
):
a = torch.randn((m, k), device='cuda', dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device='cuda', dtype=dtype) / 10
w2 = torch.randn((e, k, n), device='cuda', dtype=dtype) / 10
score = torch.randn((m, e), device='cuda', dtype=dtype)
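    # renormalize=False so fused_moe matches torch_moe above, which keeps the
    # raw softmax weights of the selected experts rather than renormalizing
    # them to sum to one.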
triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False)
torch_output = torch_moe(a, w1, w2, score, topk)
assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0)
@pytest.mark.parametrize("dtype",
[torch.float32, torch.float16, torch.bfloat16])
@torch.inference_mode()
def test_mixtral_moe(dtype: torch.dtype):
"Make sure our Mixtral MoE implementation agrees with the one from huggingface."
# Instantiate our and huggingface's MoE blocks
config = MixtralConfig()
hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
vllm_moe = MixtralMoE(
num_experts=config.num_local_experts,
top_k=config.num_experts_per_tok,
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
params_dtype=dtype,
tp_size=1,
).cuda()
# Load the weights
vllm_moe.gate.linear_weights["weight"][:] = hf_moe.gate.weight.data
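    # vLLM fuses each expert's w1 (gate) and w3 (up) projections into a single
    # packed weight `ws`, so the Hugging Face weights are concatenated on dim 0
    # in the loop below.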
for i in range(config.num_local_experts):
weights = (hf_moe.experts[i].w1.weight.data,
hf_moe.experts[i].w3.weight.data)
vllm_moe.ws[i][:] = torch.cat(weights, dim=0)
vllm_moe.w2s[i][:] = hf_moe.experts[i].w2.weight.data
# Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
# Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(inputs)
vllm_states = vllm_moe.forward(inputs)
mixtral_moe_tol = {
torch.float32: 1e-3,
torch.float16: 1e-3,
torch.bfloat16: 1e-2,
}
assert torch.allclose(hf_states,
vllm_states,
rtol=mixtral_moe_tol[dtype],
atol=mixtral_moe_tol[dtype])
...@@ -2,7 +2,7 @@ from typing import Optional ...@@ -2,7 +2,7 @@ from typing import Optional
import pytest import pytest
import torch import torch
from allclose_default import get_default_atol, get_default_rtol
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import get_rope
IS_NEOX_STYLE = [True, False] IS_NEOX_STYLE = [True, False]
...@@ -13,7 +13,9 @@ NUM_HEADS = [7, 17] # Arbitrary values for testing ...@@ -13,7 +13,9 @@ NUM_HEADS = [7, 17] # Arbitrary values for testing
BATCH_SIZES = [1, 5] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing
SEQ_LENS = [11, 8192] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
...@@ -24,7 +26,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] ...@@ -24,7 +26,7 @@ DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_rotary_embedding( def test_rotary_embedding(
is_neox_style: bool, is_neox_style: bool,
...@@ -35,28 +37,26 @@ def test_rotary_embedding( ...@@ -35,28 +37,26 @@ def test_rotary_embedding(
rotary_dim: Optional[int], rotary_dim: Optional[int],
dtype: torch.dtype, dtype: torch.dtype,
seed: int, seed: int,
device: int, device: str,
max_position: int = 8192, max_position: int = 8192,
base: int = 10000, base: int = 10000,
) -> None: ) -> None:
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
torch.random.manual_seed(seed) torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed) if torch.cuda.is_available():
gpu_id = f"cuda:{device}" torch.cuda.manual_seed(seed)
torch.set_default_device(device)
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
rope = rope.to(dtype=dtype, device=gpu_id) rope = rope.to(dtype=dtype)
positions = torch.randint(0, positions = torch.randint(0, max_position, (batch_size, seq_len))
max_position, (batch_size, seq_len),
device=gpu_id)
query = torch.randn(batch_size, query = torch.randn(batch_size,
seq_len, seq_len,
num_heads * head_size, num_heads * head_size,
dtype=dtype, dtype=dtype)
device=gpu_id)
key = torch.randn_like(query) key = torch.randn_like(query)
# NOTE(woosuk): The reference implementation should be executed first # NOTE(woosuk): The reference implementation should be executed first
...@@ -64,5 +64,11 @@ def test_rotary_embedding( ...@@ -64,5 +64,11 @@ def test_rotary_embedding(
ref_query, ref_key = rope._forward(positions, query, key) ref_query, ref_key = rope._forward(positions, query, key)
out_query, out_key = rope.forward(positions, query, key) out_query, out_key = rope.forward(positions, query, key)
# Compare the results. # Compare the results.
assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) assert torch.allclose(out_query,
assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) ref_query,
atol=get_default_atol(out_query),
rtol=get_default_rtol(out_query))
assert torch.allclose(out_key,
ref_key,
atol=get_default_atol(out_key),
rtol=get_default_rtol(out_key))
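# A minimal sketch of the kind of helper assumed by the allclose_default import
# above; the real tests/kernels/allclose_default.py may use different
# dtype-dependent tolerances:
#   _ATOL = {torch.float16: 1e-5, torch.bfloat16: 1e-2, torch.float32: 1e-5}
#   _RTOL = {torch.float16: 1e-3, torch.bfloat16: 1e-2, torch.float32: 1e-5}
#   def get_default_atol(x): return _ATOL.get(x.dtype, 1e-5)
#   def get_default_rtol(x): return _RTOL.get(x.dtype, 1e-5)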
import random
import pytest
import time
import torch
from vllm.model_executor.layers.triton_kernel.prefix_prefill import (
context_attention_fwd)
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask
NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 8, 64]
HEAD_SIZES = [128]
DTYPES = [torch.float16]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
]
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("num_queries_per_kv", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_contexted_kv_attention(
num_heads: int,
num_queries_per_kv: int,
head_size: int,
dtype: torch.dtype,
device: str,
) -> None:
random.seed(0)
torch.manual_seed(0)
if torch.cuda.is_available():
torch.cuda.manual_seed(0)
torch.set_default_device(device)
MAX_SEQ_LEN = 1024
MAX_CTX_LEN = 1024
BS = 10
cache_size = 640
block_size = 32
max_block_per_request = 64
subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)]
ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)]
seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)]
num_kv_heads = num_heads // num_queries_per_kv
num_tokens = sum(subquery_lens)
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
query.uniform_(-1e-3, 1e-3)
output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
kv.uniform_(-1e-3, 1e-3)
key, value = kv.unbind(dim=1)
k_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
v_cache = torch.zeros(cache_size,
block_size,
num_kv_heads,
head_size,
dtype=dtype)
k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype)
values = torch.arange(0, cache_size, dtype=torch.long)
values = values[torch.randperm(cache_size)]
block_table = values[:BS * max_block_per_request].view(
BS, max_block_per_request)
b_seq_len = torch.tensor(seq_lens, dtype=torch.long)
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
b_start_loc = torch.cumsum(torch.tensor([0] + subquery_lens[:-1],
dtype=torch.long),
dim=0)
max_input_len = MAX_SEQ_LEN
# copy kv to cache
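    # For each sequence, the sub-query (new) portion of key/value is copied
    # into k/v, while the already-computed context portion is scattered
    # block-by-block into the paged k_cache/v_cache according to block_table.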
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
dtype=torch.long),
dim=0)
for i in range(BS):
for j in range(subquery_lens[i]):
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
j])
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
b_ctx_len[i] + j])
cur_ctx = 0
block_id = 0
while cur_ctx < b_ctx_len[i]:
start_loc = b_seq_start_loc[i] + cur_ctx
if cur_ctx + block_size > b_ctx_len[i]:
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
else:
end_loc = start_loc + block_size
start_slot = block_table[i, block_id] * block_size
end_slot = start_slot + end_loc - start_loc
k_cache.view(-1, num_kv_heads,
head_size)[start_slot:end_slot].copy_(
key[start_loc:end_loc])
v_cache.view(-1, num_kv_heads,
head_size)[start_slot:end_slot].copy_(
value[start_loc:end_loc])
cur_ctx += block_size
block_id += 1
# transpose K_cache[num_blocks, block_size, num_kv_heads, head_size]
# to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8]
k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8,
8).permute(0, 2, 3, 1, 4).contiguous()
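    # (Here 8 is the fp16 packing width, 16 bytes / 2-byte elements, matching
    #  the vectorized key-cache layout the kernel expects.)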
# transpose V_cache[num_blocks, block_size, num_kv_heads, head_size]
# to V_cache[num_blocks, num_kv_heads, head_size, block_size]
v_cache = v_cache.view(-1, block_size, num_kv_heads,
head_size).permute(0, 2, 3, 1).contiguous()
    # Warm up the Triton kernel by calling it once before actually measuring
    # generation time.
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
b_start_loc, b_seq_len, b_ctx_len, max_input_len)
torch.cuda.synchronize()
start_time = time.time()
context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
b_start_loc, b_seq_len, b_ctx_len, max_input_len)
torch.cuda.synchronize()
end_time = time.time()
print(f"triton Time: {(end_time - start_time)*1000:.2f} ms")
scale = float(1.0 / (head_size**0.5))
attn_op = xops.fmha.cutlass.FwOp()
if num_kv_heads != num_heads:
# As of Nov 2023, xformers only supports MHA. For MQA/GQA,
# project the key and value tensors to the desired number of
# heads.
#
# see also: vllm/model_executor/layers/attention.py
query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv,
query.shape[-1])
key = key[:, :, None, :].expand(key.shape[0], num_kv_heads,
num_queries_per_kv, key.shape[-1])
value = value[:, :,
None, :].expand(value.shape[0], num_kv_heads,
num_queries_per_kv, value.shape[-1])
query = query.unsqueeze(0)
key = key.unsqueeze(0)
value = value.unsqueeze(0)
attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
subquery_lens, seq_lens)
output_ref = xops.memory_efficient_attention_forward(
query,
key,
value,
attn_bias=attn_bias,
p=0.0,
scale=scale,
op=attn_op,
)
torch.cuda.synchronize()
start_time = time.time()
output_ref = xops.memory_efficient_attention_forward(
query,
key,
value,
attn_bias=attn_bias,
p=0.0,
scale=scale,
op=attn_op,
)
torch.cuda.synchronize()
end_time = time.time()
print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms")
output_ref = output_ref.squeeze(0, 2)
assert torch.allclose(output_ref, output, atol=1e-6, rtol=0)
import contextlib
import gc
import tempfile
from collections import OrderedDict
from unittest.mock import patch, MagicMock
import pytest
import ray
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parallel_utils.parallel_state import (
destroy_model_parallel, initialize_model_parallel)
def cleanup():
destroy_model_parallel()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
ray.shutdown()
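# cleanup() is invoked between tests (see cleanup_fixture below) to tear down
# model-parallel/process-group state and release GPU memory so that each test
# starts from a clean slate.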
@pytest.fixture(autouse=True)
def cleanup_fixture():
yield
cleanup()
@pytest.fixture
def dist_init():
if not torch.distributed.is_initialized():
temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(
backend="nccl",
world_size=1,
rank=0,
init_method=f"file://{temp_file}",
)
torch.distributed.all_reduce(torch.zeros(1).cuda())
initialize_model_parallel(1, 1)
yield
cleanup()
@pytest.fixture
def dist_init_torch_only():
if torch.distributed.is_initialized():
return
temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group(
backend="nccl",
world_size=1,
rank=0,
init_method=f"file://{temp_file}",
)
@pytest.fixture
def dummy_model() -> nn.Module:
model = nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
])),
),
("act2", nn.ReLU()),
("output", ColumnParallelLinear(50, 10)),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("sampler", Sampler(512))
]))
model.config = MagicMock()
return model
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
model = nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(764, 100)),
("dense2", RowParallelLinear(100, 50)),
(
"layer1",
nn.Sequential(
OrderedDict([
("dense1", ColumnParallelLinear(100, 10)),
("dense2", RowParallelLinear(10, 50)),
])),
),
("act2", nn.ReLU()),
("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
("outact", nn.Sigmoid()),
# Special handling for lm_head & sampler
("lm_head", ParallelLMHead(512, 10)),
("sampler", Sampler(512))
]))
model.config = MagicMock()
return model
@pytest.fixture(scope="session")
def sql_lora_files():
return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
@pytest.fixture(scope="session")
def mixtral_lora_files():
return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
@pytest.fixture(scope="session")
def gemma_lora_files():
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
@pytest.fixture
def llama_2_7b_engine_extra_embeddings() -> nn.Module:
cleanup()
get_model_old = get_model
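    # Patch model loading so the model is built with a LoRAConfig (allocating
    # the extra LoRA vocab embeddings) even though the engine below is created
    # with enable_lora=False.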
def get_model_patched(model_config, device_config, **kwargs):
return get_model_old(model_config,
device_config,
lora_config=LoRAConfig(max_loras=4,
max_lora_rank=8))
with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
yield engine.llm_engine
del engine
cleanup()
@pytest.fixture
def llama_2_7b_model_extra_embeddings(
llama_2_7b_engine_extra_embeddings) -> nn.Module:
yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model