Merge branch 'v0.6.3.post1-dev'

ad385667 · zhuwenwen · be0967c1 · 903593d3 · ad385667 · ad385667
Commit ad385667 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
+import subprocess
+import sys
+import tempfile
+import time
+from http import HTTPStatus
+import openai
+import pytest
+import pytest_asyncio
+import requests
+from prometheus_client.parser import text_string_to_metric_families
+from transformers import AutoTokenizer
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+@pytest.fixture(scope="module")
+def default_server_args():
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "1024",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "128",
+    ]
+@pytest.fixture(scope="module",
+                params=[
+                    "",
+                    "--enable-chunked-prefill",
+                    "--disable-frontend-multiprocessing",
+                ])
+def server(default_server_args, request):
+    if request.param:
+        default_server_args.append(request.param)
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as cl:
+        yield cl
+_PROMPT = "Hello my name is Robert and I love magic"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]
+_NUM_REQUESTS = 10
+_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
+_NUM_GENERATION_TOKENS_PER_REQUEST = 10
+# {metric_family: [(suffix, expected_value)]}
+EXPECTED_VALUES = {
+    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:time_per_output_token_seconds":
+    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
+    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_prompt_tokens":
+    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
+     ("_count", _NUM_REQUESTS)],
+    "vllm:request_generation_tokens":
+    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+     ("_count", _NUM_REQUESTS)],
+    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
+    "vllm:prompt_tokens": [("_total",
+                            _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
+    "vllm:generation_tokens":
+    [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
+    "vllm:request_success": [("_total", _NUM_REQUESTS)],
+}
+@pytest.mark.asyncio
+async def test_metrics_counts(client: openai.AsyncOpenAI):
+    base_url = str(client.base_url)[:-3].strip("/")
+    for _ in range(_NUM_REQUESTS):
+        # sending a request triggers the metrics to be logged.
+        await client.completions.create(
+            model=MODEL_NAME,
+            prompt=_TOKENIZED_PROMPT,
+            max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
+    response = requests.get(base_url + "/metrics")
+    print(response.text)
+    assert response.status_code == HTTPStatus.OK
+    # Loop over all expected metric_families
+    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
+        found_metric = False
+        # Check to see if the metric_family is found in the prom endpoint.
+        for family in text_string_to_metric_families(response.text):
+            if family.name == metric_family:
+                found_metric = True
+                # Check that each suffix is found in the prom endpoint.
+                for suffix, expected_value in suffix_values_list:
+                    metric_name_w_suffix = f"{metric_family}{suffix}"
+                    found_suffix = False
+                    for sample in family.samples:
+                        if sample.name == metric_name_w_suffix:
+                            found_suffix = True
+                            # For each suffix, value sure the value matches
+                            # what we expect.
+                            assert sample.value == expected_value, (
+                                f"{metric_name_w_suffix} expected value of "
+                                f"{expected_value} did not match found value "
+                                f"{sample.value}")
+                            break
+                    assert found_suffix, (
+                        f"Did not find {metric_name_w_suffix} in prom endpoint"
+                    )
+                break
+        assert found_metric, (f"Did not find {metric_family} in prom endpoint")
+EXPECTED_METRICS = [
+    "vllm:num_requests_running",
+    "vllm:num_requests_swapped",
+    "vllm:num_requests_waiting",
+    "vllm:gpu_cache_usage_perc",
+    "vllm:cpu_cache_usage_perc",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+    "vllm:e2e_request_latency_seconds_sum",
+    "vllm:e2e_request_latency_seconds_bucket",
+    "vllm:e2e_request_latency_seconds_count",
+    "vllm:request_prompt_tokens_sum",
+    "vllm:request_prompt_tokens_bucket",
+    "vllm:request_prompt_tokens_count",
+    "vllm:request_generation_tokens_sum",
+    "vllm:request_generation_tokens_bucket",
+    "vllm:request_generation_tokens_count",
+    "vllm:request_params_n_sum",
+    "vllm:request_params_n_bucket",
+    "vllm:request_params_n_count",
+    "vllm:num_preemptions_total",
+    "vllm:prompt_tokens_total",
+    "vllm:generation_tokens_total",
+    "vllm:request_success_total",
+    "vllm:cache_config_info",
+    # labels in cache_config_info
+    "block_size",
+    "cache_dtype",
+    "cpu_offload_gb",
+    "enable_prefix_caching",
+    "gpu_memory_utilization",
+    "num_cpu_blocks",
+    "num_gpu_blocks",
+    "num_gpu_blocks_override",
+    "sliding_window",
+    "swap_space_bytes",
+]
+@pytest.mark.asyncio
+async def test_metrics_exist(client: openai.AsyncOpenAI):
+    base_url = str(client.base_url)[:-3].strip("/")
+    # sending a request triggers the metrics to be logged.
+    await client.completions.create(model=MODEL_NAME,
+                                    prompt="Hello, my name is",
+                                    max_tokens=5,
+                                    temperature=0.0)
+    response = requests.get(base_url + "/metrics")
+    assert response.status_code == HTTPStatus.OK
+    for metric in EXPECTED_METRICS:
+        assert metric in response.text
+def test_metrics_exist_run_batch():
+    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501
+    base_url = "0.0.0.0"
+    port = "8001"
+    server_url = f"http://{base_url}:{port}"
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(input_batch)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "intfloat/e5-mistral-7b-instruct",
+            "--enable-metrics",
+            "--url",
+            base_url,
+            "--port",
+            port,
+        ], )
+        def is_server_up(url):
+            try:
+                response = requests.get(url)
+                return response.status_code == 200
+            except requests.ConnectionError:
+                return False
+        while not is_server_up(server_url):
+            time.sleep(1)
+        response = requests.get(server_url + "/metrics")
+        assert response.status_code == HTTPStatus.OK
+        proc.wait()
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
@@ -43,18 +44,21 @@ def server(zephyr_lora_files):
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI):
+async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
    models = await client.models.list()
    models = models.data
    served_model = models[0]
    lora_models = models[1:]
    assert served_model.id == MODEL_NAME
-    assert all(model.root == MODEL_NAME for model in models)
+    assert served_model.root == MODEL_NAME
+    assert all(lora_model.root == zephyr_lora_files
+               for lora_model in lora_models)
    assert lora_models[0].id == "zephyr-lora"
    assert lora_models[1].id == "zephyr-lora2"
--- a/tests/entrypoints/openai/test_oot_registration.py
+++ b/tests/entrypoints/openai/test_oot_registration.py
-import sys
+from ...utils import VLLM_PATH, RemoteOpenAIServer
-import time
+chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
-import torch
+assert chatml_jinja_path.exists()
-from openai import OpenAI, OpenAIError
-from vllm import ModelRegistry
+def run_and_test_dummy_opt_api_server(model, tp=1):
-from vllm.model_executor.models.opt import OPTForCausalLM
+    # the model is registered through the plugin
-from vllm.model_executor.sampling_metadata import SamplingMetadata
+    server_args = [
-from vllm.utils import get_open_port
+        "--gpu-memory-utilization",
+        "0.10",
+        "--dtype",
-class MyOPTForCausalLM(OPTForCausalLM):
+        "float32",
+        "--chat-template",
-    def compute_logits(self, hidden_states: torch.Tensor,
+        str(chatml_jinja_path),
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        "--load-format",
-        # this dummy model always predicts the first token
+        "dummy",
-        logits = super().compute_logits(hidden_states, sampling_metadata)
+        "-tp",
-        logits.zero_()
+        f"{tp}",
-        logits[:, 0] += 1.0
+    ]
-        return logits
+    with RemoteOpenAIServer(model, server_args) as server:
+        client = server.get_client()
+        completion = client.chat.completions.create(
-def server_function(port):
+            model=model,
-    # register our dummy model
+            messages=[{
-    ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM)
+                "role": "system",
-    sys.argv = ["placeholder.py"] + \
+                "content": "You are a helpful assistant."
-        ("--model facebook/opt-125m --gpu-memory-utilization 0.10 "
+            }, {
-        f"--dtype float32 --api-key token-abc123 --port {port}").split()
+                "role": "user",
-    import runpy
+                "content": "Hello!"
-    runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
+            }],
+            temperature=0,
+        )
-def test_oot_registration_for_api_server():
+        generated_text = completion.choices[0].message.content
-    port = get_open_port()
+        assert generated_text is not None
-    ctx = torch.multiprocessing.get_context()
+        # make sure only the first token is generated
-    server = ctx.Process(target=server_function, args=(port, ))
+        rest = generated_text.replace("<s>", "")
-    server.start()
+        assert rest == ""
-    MAX_SERVER_START_WAIT_S = 60
-    client = OpenAI(
-        base_url=f"http://localhost:{port}/v1",
+def test_oot_registration_for_api_server(dummy_opt_path: str):
-        api_key="token-abc123",
+    run_and_test_dummy_opt_api_server(dummy_opt_path)
-    )
-    now = time.time()
-    while True:
-        try:
-            completion = client.chat.completions.create(
-                model="facebook/opt-125m",
-                messages=[{
-                    "role": "system",
-                    "content": "You are a helpful assistant."
-                }, {
-                    "role": "user",
-                    "content": "Hello!"
-                }],
-                temperature=0,
-            )
-            break
-        except OpenAIError as e:
-            if "Connection error" in str(e):
-                time.sleep(3)
-                if time.time() - now > MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError("Server did not start in time") from e
-            else:
-                raise e
-    server.kill()
-    generated_text = completion.choices[0].message.content
-    # make sure only the first token is generated
-    rest = generated_text.replace("<s>", "")
-    assert rest == ""
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
+# imports for guided decoding tests
+import re
+import openai
+import pytest
+from ...utils import RemoteOpenAIServer
+@pytest.mark.asyncio
+async def test_empty_prompt():
+    model_name = "gpt2"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        with pytest.raises(openai.BadRequestError,
+                           match=re.compile('.+Prompt cannot be empty.+')):
+            await client.completions.create(model=model_name,
+                                            prompt="",
+                                            max_tokens=5,
+                                            temperature=0.0)
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -25,59 +25,63 @@ def server_with_return_tokens_as_token_ids_flag(
 @pytest.mark.asyncio
 async def test_completion_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
-    client = server_with_return_tokens_as_token_ids_flag.get_async_client()
+    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
+    ) as client:
-    completion = await client.completions.create(
+        completion = await client.completions.create(
-        model=MODEL_NAME,
+            model=MODEL_NAME,
-        # Include Unicode characters to test for dividing a single
+            # Include Unicode characters to test for dividing a single
-        # character across multiple tokens: 🎉 is [28705, 31862] for the
+            # character across multiple tokens: 🎉 is [28705, 31862] for the
-        # Zephyr tokenizer
+            # Zephyr tokenizer
-        prompt="Say 'Hello, world! 🎉'",
+            prompt="Say 'Hello, world! 🎉'",
-        echo=True,
+            echo=True,
-        temperature=0,
+            temperature=0,
-        max_tokens=10,
+            max_tokens=10,
-        logprobs=1)
+            logprobs=1)
-    text = completion.choices[0].text
+        text = completion.choices[0].text
-    token_strs = completion.choices[0].logprobs.tokens
+        token_strs = completion.choices[0].logprobs.tokens
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # Check that the token representations are consistent between raw tokens
+        # Check that the token representations are consistent between raw
-    # and top_logprobs
+        # tokens and top_logprobs
-    # Slice off the first one, because there's no scoring associated with BOS
+        # Slice off the first one, because there's no scoring associated
-    top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
+        # with BOS
-    top_logprob_keys = [
+        top_logprobs = completion.choices[0].logprobs.top_logprobs[1:]
-        next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
+        top_logprob_keys = [
-    ]
+            next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs
-    assert token_strs[1:] == top_logprob_keys
+        ]
+        assert token_strs[1:] == top_logprob_keys
-    # Check that decoding the tokens gives the expected text
+        # Check that decoding the tokens gives the expected text
-    tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
+        tokens = [int(token.removeprefix("token_id:")) for token in token_strs]
-    assert text == tokenizer.decode(tokens, skip_special_tokens=True)
+        assert text == tokenizer.decode(tokens, skip_special_tokens=True)
 @pytest.mark.asyncio
 async def test_chat_return_tokens_as_token_ids_completion(
        server_with_return_tokens_as_token_ids_flag):
-    client = server_with_return_tokens_as_token_ids_flag.get_async_client()
+    async with server_with_return_tokens_as_token_ids_flag.get_async_client(
-    response = await client.chat.completions.create(
+    ) as client:
-        model=MODEL_NAME,
+        response = await client.chat.completions.create(
-        # Include Unicode characters to test for dividing a single
+            model=MODEL_NAME,
-        # character across multiple tokens: 🎉 is [28705, 31862] for the
+            # Include Unicode characters to test for dividing a single
-        # Zephyr tokenizer
+            # character across multiple tokens: 🎉 is [28705, 31862] for the
-        messages=[{
+            # Zephyr tokenizer
-            "role": "system",
+            messages=[{
-            "content": "You like to respond in only emojis, like 🎉"
+                "role": "system",
-        }, {
+                "content": "You like to respond in only emojis, like 🎉"
-            "role": "user",
+            }, {
-            "content": "Please write some emojis: 🐱🐶🎉"
+                "role": "user",
-        }],
+                "content": "Please write some emojis: 🐱🐶🎉"
-        temperature=0,
+            }],
-        max_tokens=8,
+            temperature=0,
-        logprobs=True)
+            max_tokens=8,
+            logprobs=True)
-    text = response.choices[0].message.content
+        text = response.choices[0].message.content
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
+        tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    token_ids = []
+        token_ids = []
-    for logprob_content in response.choices[0].logprobs.content:
+        for logprob_content in response.choices[0].logprobs.content:
-        token_ids.append(int(logprob_content.token.removeprefix("token_id:")))
+            token_ids.append(
-    assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
+                int(logprob_content.token.removeprefix("token_id:")))
+        assert tokenizer.decode(token_ids, skip_special_tokens=True) == text
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -7,13 +7,41 @@ from vllm.entrypoints.openai.protocol import BatchRequestOutput
 # ruff: noqa: E501
 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
 INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
+{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
-def test_e2e():
+def test_empty_file():
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write("")
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            input_file.name, "-o", output_file.name, "--model",
+            "intfloat/e5-mistral-7b-instruct"
+        ], )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+        contents = output_file.read()
+        assert contents.strip() == ""
+def test_completions():
    with tempfile.NamedTemporaryFile(
            "w") as input_file, tempfile.NamedTemporaryFile(
                "r") as output_file:
@@ -35,7 +63,7 @@ def test_e2e():
            BatchRequestOutput.model_validate_json(line)
-def test_e2e_invalid_input():
+def test_completions_invalid_input():
    """
    Ensure that we fail when the input doesn't conform to the openai api.
    """
@@ -52,3 +80,25 @@ def test_e2e_invalid_input():
        proc.communicate()
        proc.wait()
        assert proc.returncode != 0, f"{proc=}"
+def test_embeddings():
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(INPUT_EMBEDDING_BATCH)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
+            input_file.name, "-o", output_file.name, "--model",
+            "intfloat/e5-mistral-7b-instruct"
+        ], )
+        proc.communicate()
+        proc.wait()
+        assert proc.returncode == 0, f"{proc=}"
+        contents = output_file.read()
+        for line in contents.strip().split("\n"):
+            # Ensure that the output format conforms to the openai api.
+            # Validation should throw if the schema is wrong.
+            BatchRequestOutput.model_validate_json(line)
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -3,13 +3,21 @@ from contextlib import suppress
 from dataclasses import dataclass
 from unittest.mock import MagicMock
-from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.config import MultiModalConfig
+from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_engine import BaseModelPath
 from vllm.transformers_utils.tokenizer import get_tokenizer
 MODEL_NAME = "openai-community/gpt2"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
+BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
+@dataclass
+class MockHFConfig:
+    model_type: str = "any"
 @dataclass
@@ -20,6 +28,8 @@ class MockModelConfig:
    max_model_len = 100
    tokenizer_revision = None
    embedding_mode = False
+    multimodal_config = MultiModalConfig()
+    hf_config = MockHFConfig()
 @dataclass
@@ -35,7 +45,7 @@ async def _async_serving_chat_init():
    serving_completion = OpenAIServingChat(engine,
                                           model_config,
-                                           served_model_names=[MODEL_NAME],
+                                           BASE_MODEL_PATHS,
                                           response_role="assistant",
                                           chat_template=CHAT_TEMPLATE,
                                           lora_modules=None,
@@ -50,12 +60,13 @@ def test_async_serving_chat_init():
 def test_serving_chat_should_set_correct_max_tokens():
-    mock_engine = MagicMock(spec=AsyncLLMEngine)
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
    serving_chat = OpenAIServingChat(mock_engine,
                                     MockModelConfig(),
-                                     served_model_names=[MODEL_NAME],
+                                     BASE_MODEL_PATHS,
                                     response_role="assistant",
                                     chat_template=CHAT_TEMPLATE,
                                     lora_modules=None,
@@ -73,7 +84,6 @@ def test_serving_chat_should_set_correct_max_tokens():
    with suppress(Exception):
        asyncio.run(serving_chat.create_chat_completion(req))
-    # AsyncLLMEngine.generate(inputs, sampling_params, ...)
    assert mock_engine.generate.call_args.args[1].max_tokens == 93
    req.max_tokens = 10

--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
+from http import HTTPStatus
+from unittest.mock import MagicMock
+import pytest
+from vllm.config import ModelConfig
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.openai.protocol import (ErrorResponse,
+                                              LoadLoraAdapterRequest,
+                                              UnloadLoraAdapterRequest)
+from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+MODEL_NAME = "meta-llama/Llama-2-7b"
+BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
+LORA_LOADING_SUCCESS_MESSAGE = (
+    "Success: LoRA adapter '{lora_name}' added successfully.")
+LORA_UNLOADING_SUCCESS_MESSAGE = (
+    "Success: LoRA adapter '{lora_name}' removed successfully.")
+async def _async_serving_engine_init():
+    mock_engine_client = MagicMock(spec=EngineClient)
+    mock_model_config = MagicMock(spec=ModelConfig)
+    # Set the max_model_len attribute to avoid missing attribute
+    mock_model_config.max_model_len = 2048
+    serving_engine = OpenAIServing(mock_engine_client,
+                                   mock_model_config,
+                                   BASE_MODEL_PATHS,
+                                   lora_modules=None,
+                                   prompt_adapters=None,
+                                   request_logger=None)
+    return serving_engine
+@pytest.mark.asyncio
+async def test_load_lora_adapter_success():
+    serving_engine = await _async_serving_engine_init()
+    request = LoadLoraAdapterRequest(lora_name="adapter",
+                                     lora_path="/path/to/adapter2")
+    response = await serving_engine.load_lora_adapter(request)
+    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
+    assert len(serving_engine.lora_requests) == 1
+    assert serving_engine.lora_requests[0].lora_name == "adapter"
+@pytest.mark.asyncio
+async def test_load_lora_adapter_missing_fields():
+    serving_engine = await _async_serving_engine_init()
+    request = LoadLoraAdapterRequest(lora_name="", lora_path="")
+    response = await serving_engine.load_lora_adapter(request)
+    assert isinstance(response, ErrorResponse)
+    assert response.type == "InvalidUserInput"
+    assert response.code == HTTPStatus.BAD_REQUEST
+@pytest.mark.asyncio
+async def test_load_lora_adapter_duplicate():
+    serving_engine = await _async_serving_engine_init()
+    request = LoadLoraAdapterRequest(lora_name="adapter1",
+                                     lora_path="/path/to/adapter1")
+    response = await serving_engine.load_lora_adapter(request)
+    assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
+        lora_name='adapter1')
+    assert len(serving_engine.lora_requests) == 1
+    request = LoadLoraAdapterRequest(lora_name="adapter1",
+                                     lora_path="/path/to/adapter1")
+    response = await serving_engine.load_lora_adapter(request)
+    assert isinstance(response, ErrorResponse)
+    assert response.type == "InvalidUserInput"
+    assert response.code == HTTPStatus.BAD_REQUEST
+    assert len(serving_engine.lora_requests) == 1
+@pytest.mark.asyncio
+async def test_unload_lora_adapter_success():
+    serving_engine = await _async_serving_engine_init()
+    request = LoadLoraAdapterRequest(lora_name="adapter1",
+                                     lora_path="/path/to/adapter1")
+    response = await serving_engine.load_lora_adapter(request)
+    assert len(serving_engine.lora_requests) == 1
+    request = UnloadLoraAdapterRequest(lora_name="adapter1")
+    response = await serving_engine.unload_lora_adapter(request)
+    assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
+        lora_name='adapter1')
+    assert len(serving_engine.lora_requests) == 0
+@pytest.mark.asyncio
+async def test_unload_lora_adapter_missing_fields():
+    serving_engine = await _async_serving_engine_init()
+    request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
+    response = await serving_engine.unload_lora_adapter(request)
+    assert isinstance(response, ErrorResponse)
+    assert response.type == "InvalidUserInput"
+    assert response.code == HTTPStatus.BAD_REQUEST
+@pytest.mark.asyncio
+async def test_unload_lora_adapter_not_found():
+    serving_engine = await _async_serving_engine_init()
+    request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
+    response = await serving_engine.unload_lora_adapter(request)
+    assert isinstance(response, ErrorResponse)
+    assert response.type == "InvalidUserInput"
+    assert response.code == HTTPStatus.BAD_REQUEST
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
+import json
+import os
+import openai
+import pytest
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.mark.asyncio
+async def test_shutdown_on_engine_failure(tmp_path):
+    # Use a bad adapter to crash the engine
+    # (This test will fail when that bug is fixed)
+    adapter_path = tmp_path / "bad_adapter"
+    os.mkdir(adapter_path)
+    with open(adapter_path / "adapter_model_config.json", "w") as f:
+        json.dump({"not": "real"}, f)
+    with open(adapter_path / "adapter_model.safetensors", "wb") as f:
+        f.write(b"this is fake")
+    # dtype, max-len etc set so that this can run in CI
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "128",
+        "--enable-lora",
+        "--lora-modules",
+        f"bad-adapter={tmp_path / 'bad_adapter'}",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        async with remote_server.get_async_client() as client:
+            with pytest.raises(
+                (openai.APIConnectionError, openai.InternalServerError)):
+                # This crashes the engine
+                await client.completions.create(model="bad-adapter",
+                                                prompt="Hello, my name is")
+            # Now the server should shut down
+            return_code = remote_server.proc.wait(timeout=8)
+            assert return_code is not None
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
 import openai  # use the official client for correctness check
 import pytest
+import pytest_asyncio
 import requests
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -42,9 +43,10 @@ def tokenizer_name(model_name: str,
        model_name == "zephyr-lora2") else model_name
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.mark.asyncio
@@ -102,28 +104,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
                "role": "user",
                "content": "Can I ask a question? vllm1"
            }]
+            for continue_final in [False, True]:
-            prompt = tokenizer.apply_chat_template(
+                if add_generation and continue_final:
-                add_generation_prompt=add_generation,
+                    continue
-                conversation=conversation,
+                if continue_final:
-                tokenize=False)
+                    conversation.append({
-            tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
+                        "role": "assistant",
+                        "content": "Sure,"
-            response = requests.post(base_url + "/tokenize",
+                    })
-                                     json={
-                                         "add_generation_prompt":
+                prompt = tokenizer.apply_chat_template(
-                                         add_generation,
+                    add_generation_prompt=add_generation,
-                                         "add_special_tokens": add_special,
+                    continue_final_message=continue_final,
-                                         "messages": conversation,
+                    conversation=conversation,
-                                         "model": model_name
+                    tokenize=False)
-                                     })
+                tokens = tokenizer.encode(prompt,
-            response.raise_for_status()
+                                          add_special_tokens=add_special)
-            assert response.json() == {
+                response = requests.post(base_url + "/tokenize",
-                "tokens": tokens,
+                                         json={
-                "count": len(tokens),
+                                             "add_generation_prompt":
-                "max_model_len": 8192
+                                             add_generation,
-            }
+                                             "continue_final_message":
+                                             continue_final,
+                                             "add_special_tokens": add_special,
+                                             "messages": conversation,
+                                             "model": model_name
+                                         })
+                response.raise_for_status()
+                assert response.json() == {
+                    "tokens": tokens,
+                    "count": len(tokens),
+                    "max_model_len": 8192
+                }
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -2,14 +2,14 @@ from typing import Dict, List
 import openai
 import pytest
+import pytest_asyncio
 from vllm.multimodal.utils import encode_image_base64, fetch_image
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
-MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
-LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja"
+MAXIMUM_IMAGES = 2
-assert LLAVA_CHAT_TEMPLATE.exists()
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
@@ -26,19 +26,23 @@ def server():
        "--dtype",
        "bfloat16",
        "--max-model-len",
-        "4096",
+        "2048",
+        "--max-num-seqs",
+        "5",
        "--enforce-eager",
-        "--chat-template",
+        "--trust-remote-code",
-        str(LLAVA_CHAT_TEMPLATE),
+        "--limit-mm-per-prompt",
+        f"image={MAXIMUM_IMAGES}",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
-@pytest.fixture(scope="module")
+@pytest_asyncio.fixture
-def client(server):
+async def client(server):
-    return server.get_async_client()
+    async with server.get_async_client() as async_client:
+        yield async_client
 @pytest.fixture(scope="session")
@@ -82,7 +86,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+        completion_tokens=10, prompt_tokens=772, total_tokens=782)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -137,7 +141,7 @@ async def test_single_chat_session_image_base64encoded(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+        completion_tokens=10, prompt_tokens=772, total_tokens=782)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -215,26 +219,22 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
 async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 image_url: str):
+                                 image_urls: List[str]):
    messages = [{
        "role":
        "user",
        "content": [
-            {
+            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
-            },
+            } for image_url in image_urls),
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
            {
                "type": "text",
                "text": "What's in this image?"
@@ -242,20 +242,30 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
        ],
    }]
-    with pytest.raises(openai.BadRequestError):  # test multi-image input
+    if len(image_urls) > MAXIMUM_IMAGES:
-        await client.chat.completions.create(
+        with pytest.raises(openai.BadRequestError):  # test multi-image input
+            await client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                max_tokens=10,
+                temperature=0.0,
+            )
+        # the server should still work afterwards
+        completion = await client.completions.create(
+            model=model_name,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+        )
+        completion = completion.choices[0].text
+        assert completion is not None and len(completion) >= 0
+    else:
+        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )
+        message = chat_completion.choices[0].message
-    # the server should still work afterwards
+        assert message.content is not None and len(message.content) >= 0
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    completion = completion.choices[0].text
-    assert completion is not None and len(completion) >= 0
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
+import warnings
+from typing import Optional
+import pytest
+from PIL import Image
+from vllm.assets.image import ImageAsset
+from vllm.config import ModelConfig
+from vllm.entrypoints.chat_utils import (parse_chat_messages,
+                                         parse_chat_messages_futures)
+from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.utils import encode_image_base64
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
+@pytest.fixture(scope="module")
+def phi3v_model_config():
+    return ModelConfig(PHI3V_MODEL_ID,
+                       PHI3V_MODEL_ID,
+                       tokenizer_mode="auto",
+                       trust_remote_code=True,
+                       dtype="bfloat16",
+                       seed=0,
+                       limit_mm_per_prompt={
+                           "image": 2,
+                       })
+@pytest.fixture(scope="module")
+def phi3v_tokenizer():
+    return TokenizerGroup(
+        tokenizer_id=PHI3V_MODEL_ID,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+@pytest.fixture(scope="module")
+def image_url():
+    image = ImageAsset('cherry_blossom')
+    base64 = encode_image_base64(image.pil_image)
+    return f"data:image/jpeg;base64,{base64}"
+def _assert_mm_data_is_image_input(
+    mm_data: Optional[MultiModalDataDict],
+    image_count: int,
+) -> None:
+    assert mm_data is not None
+    assert set(mm_data.keys()) == {"image"}
+    image_data = mm_data.get("image")
+    assert image_data is not None
+    if image_count == 1:
+        assert isinstance(image_data, Image.Image)
+    else:
+        assert isinstance(image_data, list) and len(image_data) == image_count
+def test_parse_chat_messages_single_image(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in the image?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 1)
+@pytest.mark.asyncio
+async def test_parse_chat_messages_single_image_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_future = parse_chat_messages_futures([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in the image?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 1)
+def test_parse_chat_messages_multiple_images(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_images_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_future = parse_chat_messages_futures([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 2)
+def test_parse_chat_messages_placeholder_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type":
+            "text",
+            "text":
+            "What's in <|image_1|> and how does it compare to <|image_2|>?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "What's in <|image_1|> and how does it compare to <|image_2|>?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+def test_parse_chat_messages_placeholder_one_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type":
+            "text",
+            "text":
+            "What's in <|image_1|> and how does it compare to the other one?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
+        "other one?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2)
+def test_parse_chat_messages_multiple_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in this image?"
+        }]
+    }, {
+        "role": "assistant",
+        "content": "Some stuff."
+    }, {
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What about this one?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?"
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role": "user",
+            "content": "<|image_2|>\nWhat about this one?"
+        },
+    ]
+    _assert_mm_data_is_image_input(mm_data, 2)
+def test_parse_chat_messages_rejects_too_many_images_in_one_message(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            message="coroutine 'async_get_and_parse_image' was never awaited")
+        with pytest.raises(
+                ValueError,
+                match="At most 2 image\\(s\\) may be provided in one request\\."
+        ):
+            parse_chat_messages([{
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }, {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }, {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }, {
+                    "type": "text",
+                    "text": "What's in these images?"
+                }]
+            }], phi3v_model_config, phi3v_tokenizer)
+def test_parse_chat_messages_rejects_too_many_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            message="coroutine 'async_get_and_parse_image' was never awaited")
+        with pytest.raises(
+                ValueError,
+                match="At most 2 image\\(s\\) may be provided in one request\\."
+        ):
+            parse_chat_messages([{
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }, {
+                    "type": "text",
+                    "text": "What's in this image?"
+                }]
+            }, {
+                "role": "assistant",
+                "content": "Some stuff."
+            }, {
+                "role":
+                "user",
+                "content": [{
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }, {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }, {
+                    "type": "text",
+                    "text": "What about these two?"
+                }]
+            }], phi3v_model_config, phi3v_tokenizer)
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -2,6 +2,13 @@ from typing import Optional, Tuple, Union
 import torch
+from vllm.utils import is_hip
+# Using the default value (240.0) from pytorch will cause accuracy
+# issue on dynamic quantization models. Here use 224.0 for rocm.
+ROCM_FP8_MAX = 224.0
+FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn
 def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
    return torch.as_tensor(x, dtype=torch.float32, device='cuda')
@@ -11,13 +18,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
                                scale_ub: Optional[torch.tensor] = None) \
        -> Tuple[torch.tensor, torch.tensor]:
-    assert quant_dtype in [torch.int8, torch.float8_e4m3fn]
+    assert quant_dtype in [torch.int8, FP8_DTYPE]
    if scale_ub is not None:
-        assert quant_dtype == torch.float8_e4m3fn
+        assert quant_dtype == FP8_DTYPE
    qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
            else torch.finfo(quant_dtype)
-    qtype_max = as_float32_tensor(qtype_traits.max)
+    qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max
+    qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min
+    qtype_max = as_float32_tensor(qtype_traits_max)
    s_1 = as_float32_tensor(1.0)
    s_512 = as_float32_tensor(512.0)
@@ -37,15 +46,15 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
        iscales = as_float32_tensor(s_1 / scales)
        torch_out = as_float32_tensor(x) * iscales
        torch_out = torch_out.round()
-        torch_out = torch_out.clamp(qtype_traits.min,
+        torch_out = torch_out.clamp(qtype_traits_min,
-                                    qtype_traits.max).to(quant_dtype)
+                                    qtype_traits_max).to(quant_dtype)
    else:
-        assert quant_dtype == torch.float8_e4m3fn
+        assert quant_dtype == FP8_DTYPE
        min_scaling_factor = s_1 / (qtype_max * s_512)
        scales = scales.clamp(min=min_scaling_factor)
        torch_out = as_float32_tensor(x) / scales
-        torch_out = torch_out.clamp(qtype_traits.min,
+        torch_out = torch_out.clamp(qtype_traits_min,
-                                    qtype_traits.max).to(quant_dtype)
+                                    qtype_traits_max).to(quant_dtype)
    return torch_out, scales
@@ -56,8 +65,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
 def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
                    -> Tuple[torch.tensor, torch.tensor]:
-    fp8_traits = torch.finfo(torch.float8_e4m3fn)
+    fp8_traits = torch.finfo(FP8_DTYPE)
-    fp8_max = as_float32_tensor(fp8_traits.max)
+    fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max
+    fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min
+    fp8_max = as_float32_tensor(fp8_traits_max)
    one = as_float32_tensor(1.0)
    # For fp8, in order to match the cuda kernel output, we have to do exactly
@@ -68,5 +79,5 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
    ref_scale = x_max / fp8_max
    ref_iscale = one / ref_scale
    ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
-        fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn)
+        fp8_traits_min, fp8_traits_max).to(FP8_DTYPE)
-    return ref_out, ref_scale
+    return ref_out, ref_scale.view((1, ))
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -3,8 +3,11 @@ from typing import Type
 import pytest
 import torch
+from tests.kernels.utils import opcheck
 from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul,
-                                                   NewGELU, SiluAndMul)
+                                                   NewGELU, QuickGELU,
+                                                   SiluAndMul)
+from vllm.utils import seed_everything
 from .allclose_default import get_default_atol, get_default_rtol
@@ -32,25 +35,33 @@ def test_act_and_mul(
    seed: int,
    device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
+    seed_everything(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype)
    if activation == "silu":
        layer = SiluAndMul()
+        fn = torch.ops._C.silu_and_mul
    elif activation == "gelu":
        layer = GeluAndMul(approximate="none")
+        fn = torch.ops._C.gelu_and_mul
    elif activation == "gelu_tanh":
        layer = GeluAndMul(approximate="tanh")
+        fn = torch.ops._C.gelu_tanh_and_mul
    out = layer(x)
    ref_out = layer.forward_native(x)
    # The SiLU and GELU implementations are equivalent to the native PyTorch
    # implementations, so we can do exact comparison.
-    assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
+    torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0)
+    d = x.shape[-1] // 2
+    output_shape = (x.shape[:-1] + (d, ))
+    out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+    opcheck(fn, (out, x))
-@pytest.mark.parametrize("activation", [FastGELU, NewGELU])
+@pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast),
+                                        (NewGELU, torch.ops._C.gelu_new),
+                                        (QuickGELU, torch.ops._C.gelu_quick)])
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("d", D)
 @pytest.mark.parametrize("dtype", DTYPES)
@@ -65,15 +76,17 @@ def test_activation(
    seed: int,
    device: str,
 ) -> None:
-    torch.random.manual_seed(seed)
+    seed_everything(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    x = torch.randn(num_tokens, d, dtype=dtype)
-    layer = activation()
+    layer = activation[0]()
+    fn = activation[1]
    out = layer(x)
    ref_out = layer.forward_native(x)
-    assert torch.allclose(out,
+    torch.testing.assert_close(out,
-                          ref_out,
+                               ref_out,
-                          atol=get_default_atol(out),
+                               atol=get_default_atol(out),
-                          rtol=get_default_rtol(out))
+                               rtol=get_default_rtol(out))
+    out = torch.empty_like(x)
+    opcheck(fn, (out, x))
--- a/tests/kernels/test_aqlm.py
+++ b/tests/kernels/test_aqlm.py
+import torch
+from tests.kernels.utils import opcheck
+from vllm import _custom_ops as ops  # noqa: F401
+def test_aqlm_dequant_opcheck():
+    codes = torch.randint(-32768,
+                          32767, (22016, 512, 1),
+                          device='cuda',
+                          dtype=torch.int16)
+    codebooks = torch.rand((2, 65536, 1, 8),
+                           device='cuda',
+                           dtype=torch.float16)
+    codebook_partition_sizes = [11008, 11008]
+    opcheck(torch.ops._C.aqlm_dequant,
+            (codes, codebooks, codebook_partition_sizes))
+def test_aqlm_gemm_opcheck():
+    input = torch.rand((4, 4096), device='cuda', dtype=torch.float16)
+    codes = torch.randint(-32768,
+                          32767, (12288, 512, 1),
+                          device='cuda',
+                          dtype=torch.int16)
+    codebooks = torch.rand((3, 65536, 1, 8),
+                           device='cuda',
+                           dtype=torch.float16)
+    scales = torch.rand((12288, 1, 1, 1), device='cuda', dtype=torch.float16)
+    codebook_partition_sizes = [4096, 4096, 4096]
+    bias = None
+    opcheck(torch.ops._C.aqlm_gemm,
+            (input, codes, codebooks, scales, codebook_partition_sizes, None))
+    opcheck(torch.ops._C.aqlm_gemm,
+            (input, codes, codebooks, scales, codebook_partition_sizes, bias))
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -3,14 +3,17 @@ from typing import List, Optional, Tuple
 import pytest
 import torch
-from xformers import ops as xops
-from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
+from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.utils import get_max_shared_memory_bytes, is_hip
+from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything
 from .allclose_default import get_default_atol, get_default_rtol
+if not is_hip():
+    from xformers import ops as xops
+    from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
@@ -29,8 +32,7 @@ NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 # FlashAttention forward only supports head dimension at most 128
 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256
+HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
-              ] if not is_hip() else [64, 80, 96, 112, 128]
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
@@ -112,7 +114,8 @@ def ref_single_query_cached_kv_attention(
        output[i].copy_(out, non_blocking=True)
-@pytest.mark.parametrize("version", ["v1", "v2"])
+@pytest.mark.parametrize(
+    "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"])
 @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", HEAD_SIZES)
@@ -135,12 +138,11 @@ def test_paged_attention(
    seed: int,
    device: str,
 ) -> None:
-    if kv_cache_dtype == "fp8" and head_size % 16:
+    if ((kv_cache_dtype == "fp8" and head_size % 16)
+            or (version == "rocm" and head_size not in (64, 128))):
        pytest.skip()
-    random.seed(seed)
-    torch.random.manual_seed(seed)
+    seed_everything(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    scale = float(1.0 / (head_size**0.5))
    num_query_heads, num_kv_heads = num_heads
@@ -199,7 +201,15 @@ def test_paged_attention(
            k_scale,
            v_scale,
        )
-    elif version == "v2":
+        opcheck(torch.ops._C.paged_attention_v1,
+                (output, query, key_cache, value_cache, num_kv_heads, scale,
+                 block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
+                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                cond=(head_size == HEAD_SIZES[0]
+                      and block_size == BLOCK_SIZES[0]))
+    elif version in ("v2", "rocm"):
        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
        assert PARTITION_SIZE % block_size == 0
        num_seqs, num_heads, head_size = output.shape
@@ -212,25 +222,64 @@ def test_paged_attention(
            dtype=torch.float32,
        )
        max_logits = torch.empty_like(exp_sums)
-        ops.paged_attention_v2(
+        if version == "v2":
-            output,
+            ops.paged_attention_v2(
-            exp_sums,
+                output,
-            max_logits,
+                exp_sums,
-            tmp_output,
+                max_logits,
-            query,
+                tmp_output,
-            key_cache,
+                query,
-            value_cache,
+                key_cache,
-            num_kv_heads,
+                value_cache,
-            scale,
+                num_kv_heads,
-            block_tables,
+                scale,
-            seq_lens,
+                block_tables,
-            block_size,
+                seq_lens,
-            max_seq_len,
+                block_size,
-            alibi_slopes,
+                max_seq_len,
-            kv_cache_dtype,
+                alibi_slopes,
-            k_scale,
+                kv_cache_dtype,
-            v_scale,
+                k_scale,
-        )
+                v_scale,
+            )
+            opcheck(torch.ops._C.paged_attention_v2,
+                    (output, exp_sums, max_logits, tmp_output, query,
+                     key_cache, value_cache, num_kv_heads, scale, block_tables,
+                     seq_lens, block_size, max_seq_len, alibi_slopes,
+                     kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                    cond=(head_size == HEAD_SIZES[0]
+                          and block_size == BLOCK_SIZES[0]))
+        else:
+            ops.paged_attention_rocm(
+                output,
+                exp_sums,
+                max_logits,
+                tmp_output,
+                query,
+                key_cache,
+                value_cache,
+                num_kv_heads,
+                scale,
+                block_tables,
+                seq_lens,
+                block_size,
+                max_seq_len,
+                alibi_slopes,
+                kv_cache_dtype,
+                k_scale,
+                v_scale,
+            )
+            opcheck(torch.ops._rocm_C.paged_attention,
+                    (output, exp_sums, max_logits, tmp_output, query,
+                     key_cache, value_cache, num_kv_heads, scale, block_tables,
+                     seq_lens, block_size, max_seq_len, alibi_slopes,
+                     kv_cache_dtype, k_scale, v_scale),
+                    cond=(head_size == HEAD_SIZES[0]
+                          and block_size == BLOCK_SIZES[0]))
    else:
        raise AssertionError(f"Unknown version: {version}")
@@ -277,7 +326,7 @@ def test_paged_attention(
    atol, rtol = 1e-3, 1e-5
    if kv_cache_dtype == "fp8":
        atol, rtol = 1e-2, 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
 def ref_multi_query_kv_attention(
@@ -320,6 +369,8 @@ def ref_multi_query_kv_attention(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.skipif(is_hip(),
+                    reason="Xformers backend is not supported on ROCm.")
 @torch.inference_mode()
 def test_multi_query_kv_attention(
    num_seqs: int,
@@ -329,10 +380,7 @@ def test_multi_query_kv_attention(
    seed: int,
    device: str,
 ) -> None:
-    random.seed(seed)
+    seed_everything(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
    # As the xformers library is already tested with its own tests, we can use
@@ -382,4 +430,4 @@ def test_multi_query_kv_attention(
    )
    atol = get_default_atol(output) if is_hip() else 1e-3
    rtol = get_default_rtol(output) if is_hip() else 1e-5
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -3,9 +3,9 @@ from unittest.mock import patch
 import pytest
 import torch
-from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL,
+from tests.kernels.utils import override_backend_env_variable
-                                 override_backend_env_variable)
 from vllm.attention.selector import which_attn_to_use
+from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
 @pytest.mark.parametrize(
@@ -20,22 +20,22 @@ def test_env(name: str, device: str, monkeypatch):
    if device == "cpu":
        with patch("vllm.attention.selector.is_cpu", return_value=True):
-            backend = which_attn_to_use(8, 16, 8, None, torch.float16,
+            backend = which_attn_to_use(16, None, torch.float16, torch.float16,
-                                        torch.float16, 16)
+                                        16, False)
        assert backend.name == "TORCH_SDPA"
    elif device == "hip":
        with patch("vllm.attention.selector.is_hip", return_value=True):
-            backend = which_attn_to_use(8, 16, 8, None, torch.float16,
+            backend = which_attn_to_use(16, None, torch.float16, torch.float16,
-                                        torch.float16, 16)
+                                        16, False)
        assert backend.name == "ROCM_FLASH"
    elif device == "openvino":
        with patch("vllm.attention.selector.is_openvino", return_value=True):
-            backend = which_attn_to_use(8, 16, 8, None, torch.float16,
+            backend = which_attn_to_use(16, None, torch.float16, torch.float16,
-                                        torch.float16, 16)
+                                        16, False)
        assert backend.name == "OPENVINO"
    else:
-        backend = which_attn_to_use(8, 16, 8, None, torch.float16,
+        backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16,
-                                    torch.float16, 16)
+                                    False)
        assert backend.name == name
@@ -45,33 +45,38 @@ def test_flash_attn(monkeypatch):
    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
    # Unsupported CUDA arch
-    with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
+    with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
-        backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
+        backend = which_attn_to_use(16, None, torch.float16, None, 16, False)
        assert backend.name != STR_FLASH_ATTN_VAL
    # Unsupported data type
-    backend = which_attn_to_use(8, 16, 8, None, torch.float8_e4m3fn, None, 16)
+    backend = which_attn_to_use(16, None, torch.float8_e4m3fn, None, 16, False)
    assert backend.name != STR_FLASH_ATTN_VAL
    # Unsupported kv cache data type
-    backend = which_attn_to_use(8, 16, 8, None, torch.float16, "fp8", 16)
+    backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False)
    assert backend.name != STR_FLASH_ATTN_VAL
    # Unsupported block size
-    backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 8)
+    backend = which_attn_to_use(16, None, torch.float16, None, 8, False)
    assert backend.name != STR_FLASH_ATTN_VAL
    # Unsupported sliding window
-    backend = which_attn_to_use(8, 16, 8, 1, torch.float16, None, 16)
+    backend = which_attn_to_use(16, 1, torch.float16, None, 16, False)
    assert backend.name != STR_FLASH_ATTN_VAL
    # flash-attn is not installed
    with patch.dict('sys.modules', {'vllm_flash_attn': None}):
-        backend = which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
+        backend = which_attn_to_use(16, None, torch.float16, None, 16, False)
        assert backend.name != STR_FLASH_ATTN_VAL
    # Unsupported head size
-    backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
+    backend = which_attn_to_use(17, None, torch.float16, None, 16, False)
+    assert backend.name != STR_FLASH_ATTN_VAL
+    # Attention-free models should bypass env and use PlaceholderAttention
+    backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16,
+                                True)
    assert backend.name != STR_FLASH_ATTN_VAL
@@ -79,4 +84,4 @@ def test_invalid_env(monkeypatch):
    """Throw an exception if the backend name is invalid."""
    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
    with pytest.raises(ValueError):
-        which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
+        which_attn_to_use(16, None, torch.float16, None, 16, False)
\ No newline at end of file
--- a/tests/kernels/test_awq.py
+++ b/tests/kernels/test_awq.py
+import os
+import pytest
+import torch
+from tests.kernels.utils import opcheck
+from vllm import _custom_ops as ops  # noqa: F401
+@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
+                    reason="AWQ is not supported on this GPU type.")
+def test_awq_dequantize_opcheck():
+    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+    qweight = torch.randint(-2000000000,
+                            2000000000, (8192, 256),
+                            device='cuda',
+                            dtype=torch.int32)
+    scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
+    zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
+    split_k_iters = 0
+    thx = 0
+    thy = 0
+    opcheck(torch.ops._C.awq_dequantize,
+            (qweight, scales, zeros, split_k_iters, thx, thy))
+@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
+                    reason="AWQ is not supported on this GPU type.")
+def test_awq_gemm_opcheck():
+    os.environ["VLLM_USE_TRITON_AWQ"] = "0"
+    input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
+    qweight = torch.randint(-2000000000,
+                            2000000000, (8192, 256),
+                            device='cuda',
+                            dtype=torch.int32)
+    scales = torch.randint(-2000000000,
+                           2000000000, (64, 256),
+                           device='cuda',
+                           dtype=torch.int32)
+    qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
+    split_k_iters = 8
+    opcheck(torch.ops._C.awq_gemm,
+            (input, qweight, qzeros, scales, split_k_iters))
--- a/tests/kernels/test_awq_marlin.py
+++ b/tests/kernels/test_awq_marlin.py
+"""Test AWQ with fused MoE Marlin kernels.
+Run `pytest tests/kernels/test_awq_marlin.py`.
+"""
+import pytest
+import torch
+from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe,
+                                 torch_moe_single)
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+    fused_marlin_moe, single_marlin_moe)
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    awq_marlin_quantize)
+from vllm.scalar_type import scalar_types
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+@pytest.mark.skipif(not (ops.supports_moe_ops
+                         and hasattr(torch.ops._moe_C, "marlin_gemm_moe")),
+                    reason="Marlin is not supported on this GPU type.")
+def test_fused_marlin_moe_awq(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+):
+    torch.manual_seed(7)
+    num_bits = 4
+    quant_type = scalar_types.uint4
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    w_ref1_l = []
+    qweights1_l = []
+    scales1_l = []
+    zp1_l = []
+    for i in range(w1.shape[0]):
+        w_ref1, qweight1, scales1, zp1 = awq_marlin_quantize(
+            w1[i].transpose(1, 0), quant_type, group_size)
+        w_ref1_l.append(w_ref1)
+        qweights1_l.append(qweight1)
+        scales1_l.append(scales1)
+        zp1_l.append(zp1)
+    w_ref1 = stack_and_dev(w_ref1_l)
+    qweight1 = stack_and_dev(qweights1_l).contiguous()
+    scales1 = stack_and_dev(scales1_l)
+    zp1 = stack_and_dev(zp1_l)
+    w_ref2_l = []
+    qweights2_l = []
+    scales2_l = []
+    zp2_l = []
+    for i in range(w2.shape[0]):
+        w_ref2, qweight2, scales2, zp2 = awq_marlin_quantize(
+            w2[i].transpose(1, 0), quant_type, group_size)
+        w_ref2_l.append(w_ref2)
+        qweights2_l.append(qweight2)
+        scales2_l.append(scales2)
+        zp2_l.append(zp2)
+    w_ref2 = stack_and_dev(w_ref2_l)
+    qweight2 = stack_and_dev(qweights2_l).contiguous()
+    scales2 = stack_and_dev(scales2_l)
+    zp2 = stack_and_dev(zp2_l)
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    topk_weights, topk_ids = fused_topk(a, score, topk, False)
+    marlin_output = fused_marlin_moe(
+        a,
+        qweight1,
+        qweight2,
+        scales1,
+        scales2,
+        score,
+        topk_weights,
+        topk_ids,
+        w1_zeros=zp1,
+        w2_zeros=zp2,
+        num_bits=num_bits,
+    )
+    torch_output = torch_moe(
+        a,
+        w_ref1.transpose(1, 2),
+        w_ref2.transpose(1, 2),
+        score,
+        topk,
+    )
+    assert compute_max_diff(marlin_output, torch_output) < 4e-2
+@pytest.mark.skip("This test is here for the sake of debugging, "
+                  "don't run it in automated tests.")
+@pytest.mark.parametrize("m", [64, 512, 222, 33, 1])
+@pytest.mark.parametrize("n", [128, 2048, 256, 1024])
+@pytest.mark.parametrize("k", [128, 1024, 512])
+@pytest.mark.parametrize("e", [8, 64])
+@pytest.mark.parametrize("topk", [2, 6])
+@pytest.mark.parametrize("group_size", [-1, 32, 64, 128])
+def test_single_marlin_moe_multiply_awq(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    group_size: int,
+):
+    torch.manual_seed(7)
+    num_bits = 4
+    quant_type = scalar_types.uint4
+    dtype = torch.float16
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w = torch.randn((e, n, k), device="cuda", dtype=dtype) / 10
+    w_ref_l = []
+    qweights_l = []
+    scales_l = []
+    zp_l = []
+    for i in range(w.shape[0]):
+        w_ref, qweight, scales, zp = awq_marlin_quantize(
+            w[i].transpose(1, 0), quant_type, group_size)
+        w_ref_l.append(w_ref)
+        qweights_l.append(qweight)
+        scales_l.append(scales)
+        zp_l.append(zp)
+    w_ref = stack_and_dev(w_ref_l)
+    qweight = stack_and_dev(qweights_l).contiguous()
+    scales = stack_and_dev(scales_l).contiguous()
+    zp = stack_and_dev(zp_l).contiguous()
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    marlin_output = single_marlin_moe(a,
+                                      qweight,
+                                      scales,
+                                      score,
+                                      topk,
+                                      renormalize=False,
+                                      w_zeros=zp,
+                                      num_bits=num_bits)
+    torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk)
+    assert compute_max_diff(marlin_output, torch_output) < 1e-2
--- a/tests/kernels/test_awq_triton.py
+++ b/tests/kernels/test_awq_triton.py
+"""Tests for the AWQ Triton kernel.
+Run `pytest tests/kernels/test_awq_triton.py`.
+"""
+import pytest
+import torch
+from vllm.model_executor.layers.quantization.awq_triton import (
+    AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton)
+from vllm.utils import seed_everything
+device = "cuda"
+def reverse_awq_order(t: torch.Tensor):
+    bits = 4
+    AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
+    reverse_order_tensor = torch.arange(
+        t.shape[-1],
+        dtype=torch.int32,
+        device=t.device,
+    )
+    reverse_order_tensor = reverse_order_tensor.view(-1, 32 // bits)
+    reverse_order_tensor = reverse_order_tensor[:, AWQ_REVERSE_ORDER]
+    reverse_order_tensor = reverse_order_tensor.view(-1)
+    t = t[:, reverse_order_tensor] & 0xF
+    return t
+# qweights - [R     , C // 8], int32
+# scales   - [R // G, C     ], float16
+# zeros    - [R // G, C // 8], int32
+def awq_dequantize_torch(qweight: torch.Tensor, scales: torch.Tensor,
+                         qzeros: torch.Tensor,
+                         group_size: int) -> torch.Tensor:
+    if group_size == -1:
+        group_size = qweight.shape[0]
+    bits = 4
+    shifts = torch.arange(0, 32, bits, device=qzeros.device)
+    iweights = torch.bitwise_right_shift(qweight[:, :, None],
+                                         shifts[None, None, :]).to(torch.int8)
+    iweights = iweights.view(iweights.shape[0], -1)
+    zeros = torch.bitwise_right_shift(qzeros[:, :, None],
+                                      shifts[None, None, :]).to(torch.int8)
+    zeros = zeros.view(qzeros.shape[0], -1)
+    zeros = reverse_awq_order(zeros)
+    iweights = reverse_awq_order(iweights)
+    iweights = torch.bitwise_and(iweights, (2**bits) - 1)
+    zeros = torch.bitwise_and(zeros, (2**bits) - 1)
+    scales = scales.repeat_interleave(group_size, dim=0)
+    zeros = zeros.repeat_interleave(group_size, dim=0)
+    return (iweights - zeros) * scales
+# qweights - [R     , C // 8], int32
+# scales   - [R // G, C     ], float16
+# zeros    - [R // G, C // 8], int32
+@pytest.mark.parametrize("qweight_rows", [3584, 18944, 128, 256, 512, 1024])
+@pytest.mark.parametrize("qweight_cols", [448, 576, 4736, 16, 32, 64, 128])
+@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
+def test_dequantize(qweight_rows, qweight_cols, group_size):
+    if group_size == -1:
+        group_size = qweight_rows
+    qweight_dtype = torch.int32
+    scales_rows = qweight_rows // group_size
+    scales_cols = qweight_cols * 8
+    scales_dtype = torch.float16
+    zeros_rows = scales_rows
+    zeros_cols = qweight_cols
+    zeros_dtype = torch.int32
+    seed_everything(0)
+    qweight = torch.randint(0,
+                            torch.iinfo(torch.int32).max,
+                            (qweight_rows, qweight_cols),
+                            dtype=qweight_dtype,
+                            device=device)
+    scales = torch.rand(scales_rows,
+                        scales_cols,
+                        dtype=scales_dtype,
+                        device=device)
+    zeros = torch.randint(0,
+                          torch.iinfo(torch.int32).max,
+                          (zeros_rows, zeros_cols),
+                          dtype=zeros_dtype,
+                          device=device)
+    iweights_triton = awq_dequantize_triton(qweight, scales, zeros)
+    assert (not torch.any(torch.isinf(iweights_triton))
+            and not torch.any(torch.isnan(iweights_triton)))
+    iweights_torch = awq_dequantize_torch(qweight, scales, zeros, group_size)
+    torch.testing.assert_close(iweights_triton, iweights_torch)
+# input   - [N, K]
+# qweight - [K, M // 8]
+# qzeros  - [K // G, M // 8]
+# scales  - [K // G, M]
+@pytest.mark.parametrize("N", [1, 2, 4, 8, 14, 17, 23, 32])
+@pytest.mark.parametrize("K", [128])
+@pytest.mark.parametrize("M", [16, 24, 32])
+@pytest.mark.parametrize("group_size", AWQ_TRITON_SUPPORTED_GROUP_SIZES)
+@pytest.mark.parametrize("splitK", [1, 8])
+def test_gemm(N, K, M, splitK, group_size):
+    if group_size == -1:
+        group_size = K
+    split_k_iters = splitK
+    input_rows = N
+    input_cols = K
+    input_dtype = torch.float32
+    qweight_rows = input_cols
+    qweight_cols = M // 8
+    scales_rows = qweight_rows // group_size
+    scales_cols = M
+    scales_dtype = torch.float32
+    qzeros_rows = scales_rows
+    qzeros_cols = qweight_cols
+    seed_everything(0)
+    input = torch.rand((input_rows, input_cols),
+                       dtype=input_dtype,
+                       device=device)
+    qweight = torch.randint(0,
+                            torch.iinfo(torch.int32).max,
+                            (qweight_rows, qweight_cols),
+                            device=device)
+    qzeros = torch.randint(0,
+                           torch.iinfo(torch.int32).max,
+                           (qzeros_rows, qzeros_cols),
+                           device=device)
+    scales = torch.rand((scales_rows, scales_cols),
+                        dtype=scales_dtype,
+                        device=device)
+    output_triton = awq_gemm_triton(input, qweight, scales, qzeros,
+                                    split_k_iters)
+    assert (not torch.any(torch.isinf(output_triton))
+            and not torch.any(torch.isnan(output_triton)))
+    dequantized_weights = awq_dequantize_triton(qweight, scales, qzeros)
+    output_torch = torch.matmul(input, dequantized_weights)
+    assert (not torch.any(torch.isinf(output_torch))
+            and not torch.any(torch.isnan(output_torch)))
+    torch.testing.assert_close(output_triton.cpu(),
+                               output_torch.cpu(),
+                               atol=1e-1,
+                               rtol=1e-1)