Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1

500b93c8 · zhuwenwen · 99426767 · 38c4b7e8 · 500b93c8 · 500b93c8
Commit 500b93c8 authored Jul 25, 2024 by zhuwenwen
20 changed files
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
 import os
-import openai  # use the official client for correctness check
 import pytest
-from ..utils import RemoteOpenAIServer
+from ..utils import compare_two_settings
-# downloading lora to test lora requests
+VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
-# any model with a chat template should work here
-MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
-EAGER_MODE = bool(int(os.getenv("EAGER_MODE", 0)))
-CHUNKED_PREFILL = bool(int(os.getenv("CHUNKED_PREFILL", 0)))
-TP_SIZE = int(os.getenv("TP_SIZE", 1))
-PP_SIZE = int(os.getenv("PP_SIZE", 1))
-pytestmark = pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, DIST_BACKEND",
+    [
-@pytest.fixture(scope="module")
+        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
-def server():
+        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-    args = [
+        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        "--model",
+        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"),
-        MODEL_NAME,
+        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"),
+        (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
+    ])
+def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
+                    DIST_BACKEND):
+    if VLLM_MULTI_NODE and DIST_BACKEND == "mp":
+        pytest.skip("Skipping multi-node pipeline parallel test for "
+                    "multiprocessing distributed backend")
+    pp_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
-        "bfloat16",
+        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--tensor-parallel-size",
        str(TP_SIZE),
        "--distributed-executor-backend",
-        "ray",
+        DIST_BACKEND,
+    ]
+    # compare without pipeline parallelism
+    # NOTE: use mp backend for TP
+    # PP tests might involve multiple nodes, and ray might
+    #  schedule all workers in a node other than the head node,
+    #  which can cause the test to fail.
+    tp_args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--tensor-parallel-size",
+        str(max(TP_SIZE, 2)),  # We only use 2 GPUs in the CI.
+        "--distributed-executor-backend",
+        "mp",
    ]
    if CHUNKED_PREFILL:
-        args += [
+        pp_args.append("--enable-chunked-prefill")
-            "--enable-chunked-prefill",
+        tp_args.append("--enable-chunked-prefill")
-        ]
    if EAGER_MODE:
-        args += [
+        pp_args.append("--enforce-eager")
-            "--enforce-eager",
+        tp_args.append("--enforce-eager")
-        ]
-    with RemoteOpenAIServer(args) as remote_server:
-        yield remote_server
-@pytest.fixture(scope="module")
-def client(server):
-    return server.get_async_client()
-async def test_check_models(server, client: openai.AsyncOpenAI):
-    models = await client.models.list()
-    models = models.data
-    served_model = models[0]
-    assert served_model.id == MODEL_NAME
-    assert all(model.root == MODEL_NAME for model in models)
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_single_completion(server, client: openai.AsyncOpenAI,
-                                 model_name: str):
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
-    assert completion.choices[0].finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert completion.choices[0].text is not None and len(
-        completion.choices[0].text) >= 5
-@pytest.mark.parametrize(
-    # just test 1 lora hereafter
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_batch_completions(server, client: openai.AsyncOpenAI,
-                                 model_name: str):
-    # test simple list
-    batch = await client.completions.create(
-        model=model_name,
-        prompt=["Hello, my name is", "Hello, my name is"],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(batch.choices) == 2
-    assert batch.choices[0].text == batch.choices[1].text
-    # test n = 2
-    batch = await client.completions.create(
-        model=model_name,
-        prompt=["Hello, my name is", "Hello, my name is"],
-        n=2,
-        max_tokens=5,
-        temperature=0.0,
-        extra_body=dict(
-            # NOTE: this has to be true for n > 1 in vLLM, but not necessary
-            # for official client.
-            use_beam_search=True),
-    )
-    assert len(batch.choices) == 4
-    assert batch.choices[0].text != batch.choices[
-        1].text, "beam search should be different"
-    assert batch.choices[0].text == batch.choices[
-        2].text, "two copies of the same prompt should be the same"
-    assert batch.choices[1].text == batch.choices[
-        3].text, "two copies of the same prompt should be the same"
-    # test streaming
+    compare_two_settings(MODEL_NAME, pp_args, tp_args)
-    batch = await client.completions.create(
-        model=model_name,
-        prompt=["Hello, my name is", "Hello, my name is"],
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-    )
-    texts = [""] * 2
-    async for chunk in batch:
-        assert len(chunk.choices) == 1
-        choice = chunk.choices[0]
-        texts[choice.index] += choice.text
-    assert texts[0] == texts[1]
--- a/tests/engine/output_processor/test_stop_checker.py
+++ b/tests/engine/output_processor/test_stop_checker.py
@@ -35,8 +35,8 @@ def sequence_with_eos(text: str, eos_token: str,
 @pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
    ("This text ends with EOS token", "</s>", 2),
 ])
-@pytest.mark.parametrize("ignore_eos", [True, False, None])
+@pytest.mark.parametrize("ignore_eos", [True, False])
-@pytest.mark.parametrize("include_stop_str_in_output", [True, False, None])
+@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
 @pytest.mark.skip_global_cleanup
 def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
                           ignore_eos: bool, include_stop_str_in_output: bool):

--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
+import asyncio
+import os
+import pytest
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.llm_engine import LLMEngine
+from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
+from vllm.sampling_params import SamplingParams
+class Mock:
+    ...
+class CustomGPUExecutor(GPUExecutor):
+    def execute_model(self, *args, **kwargs):
+        # Drop marker to show that this was ran
+        with open(".marker", "w"):
+            ...
+        return super().execute_model(*args, **kwargs)
+class CustomGPUExecutorAsync(GPUExecutorAsync):
+    async def execute_model_async(self, *args, **kwargs):
+        with open(".marker", "w"):
+            ...
+        return await super().execute_model_async(*args, **kwargs)
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_custom_executor_type_checking(model):
+    with pytest.raises(ValueError):
+        engine_args = EngineArgs(model=model,
+                                 distributed_executor_backend=Mock)
+        LLMEngine.from_engine_args(engine_args)
+    with pytest.raises(ValueError):
+        engine_args = AsyncEngineArgs(model=model,
+                                      distributed_executor_backend=Mock)
+        AsyncLLMEngine.from_engine_args(engine_args)
+    with pytest.raises(TypeError):
+        engine_args = AsyncEngineArgs(
+            model=model, distributed_executor_backend=CustomGPUExecutor)
+        AsyncLLMEngine.from_engine_args(engine_args)
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_custom_executor(model, tmpdir):
+    cwd = os.path.abspath(".")
+    os.chdir(tmpdir)
+    try:
+        assert not os.path.exists(".marker")
+        engine_args = EngineArgs(
+            model=model, distributed_executor_backend=CustomGPUExecutor)
+        engine = LLMEngine.from_engine_args(engine_args)
+        sampling_params = SamplingParams(max_tokens=1)
+        engine.add_request("0", "foo", sampling_params)
+        engine.step()
+        assert os.path.exists(".marker")
+    finally:
+        os.chdir(cwd)
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_custom_executor_async(model, tmpdir):
+    cwd = os.path.abspath(".")
+    os.chdir(tmpdir)
+    try:
+        assert not os.path.exists(".marker")
+        engine_args = AsyncEngineArgs(
+            model=model, distributed_executor_backend=CustomGPUExecutorAsync)
+        engine = AsyncLLMEngine.from_engine_args(engine_args)
+        sampling_params = SamplingParams(max_tokens=1)
+        async def t():
+            stream = await engine.add_request("0", "foo", sampling_params)
+            async for x in stream:
+                ...
+        asyncio.run(t())
+        assert os.path.exists(".marker")
+    finally:
+        os.chdir(cwd)
--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
+from http import HTTPStatus
+import openai
+import pytest
+import requests
+from vllm.version import __version__ as VLLM_VERSION
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "128",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
+@pytest.mark.asyncio
+async def test_show_version(client: openai.AsyncOpenAI):
+    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(base_url + "/version")
+    response.raise_for_status()
+    assert response.json() == {"version": VLLM_VERSION}
+@pytest.mark.asyncio
+async def test_check_health(client: openai.AsyncOpenAI):
+    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(base_url + "/health")
+    assert response.status_code == HTTPStatus.OK
+@pytest.mark.asyncio
+async def test_log_metrics(client: openai.AsyncOpenAI):
+    base_url = str(client.base_url)[:-3].strip("/")
+    response = requests.get(base_url + "/metrics")
+    assert response.status_code == HTTPStatus.OK
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -7,11 +7,11 @@ import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import torch
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError
 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,33 +21,28 @@ LORA_NAME = "typeof/zephyr-7b-beta-lora"
 @pytest.fixture(scope="module")
-def zephyr_lora_files():
+def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
-    return snapshot_download(repo_id=LORA_NAME)
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
-@pytest.fixture(scope="module")
+        "bfloat16",
-def server(zephyr_lora_files):
+        "--max-model-len",
-    with RemoteOpenAIServer([
+        "8192",
-            "--model",
+        "--enforce-eager",
-            MODEL_NAME,
+        # lora config below
-            # use half precision for speed and memory savings in CI environment
+        "--enable-lora",
-            "--dtype",
+        "--lora-modules",
-            "bfloat16",
+        f"zephyr-lora={zephyr_lora_files}",
-            "--max-model-len",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
-            "8192",
+        "--max-lora-rank",
-            "--enforce-eager",
+        "64",
-            # lora config below
+        "--max-cpu-loras",
-            "--enable-lora",
+        "2",
-            "--lora-modules",
+        "--max-num-seqs",
-            f"zephyr-lora={zephyr_lora_files}",
+        "128",
-            f"zephyr-lora2={zephyr_lora_files}",
+    ]
-            "--max-lora-rank",
-            "64",
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            "--max-cpu-loras",
-            "2",
-            "--max-num-seqs",
-            "128",
-    ]) as remote_server:
        yield remote_server

--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
 # imports for guided decoding tests
 import json
 import re
+import shutil
+from tempfile import TemporaryDirectory
 from typing import List
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
-import requests
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
+from transformers import AutoTokenizer
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -17,9 +19,13 @@ from ...utils import RemoteOpenAIServer
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
+# technically these adapters use a different base model,
-# generation quality here
+# but we're not testing generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
+PA_NAME = "swapnilbp/llama_tweet_ptune"
+# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
+# need to change to match the prompt adapter
+PA_NUM_VIRTUAL_TOKENS = 8
 @pytest.fixture(scope="module")
@@ -28,28 +34,58 @@ def zephyr_lora_files():
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    with RemoteOpenAIServer([
+    tmp_dir = TemporaryDirectory()
-            "--model",
+    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-            MODEL_NAME,
+    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-            # use half precision for speed and memory savings in CI environment
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-            "--dtype",
+    # Copy tokenizer to adapter and add some unique tokens
-            "bfloat16",
+    # 32000, 32001, 32002
-            "--max-model-len",
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-            "8192",
+                                 special_tokens=True)
-            "--enforce-eager",
+    assert added == 3
-            # lora config below
+    tokenizer.save_pretrained(tmp_model_dir)
-            "--enable-lora",
+    yield tmp_model_dir
-            "--lora-modules",
+    tmp_dir.cleanup()
-            f"zephyr-lora={zephyr_lora_files}",
-            f"zephyr-lora2={zephyr_lora_files}",
-            "--max-lora-rank",
+@pytest.fixture(scope="module")
-            "64",
+def zephyr_pa_files():
-            "--max-cpu-loras",
+    return snapshot_download(repo_id=PA_NAME)
-            "2",
-            "--max-num-seqs",
-            "128",
+@pytest.fixture(scope="module")
-    ]) as remote_server:
+def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files):
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+        # pa config
+        "--enable-prompt-adapter",
+        "--prompt-adapters",
+        f"zephyr-pa={zephyr_pa_files}",
+        f"zephyr-pa2={zephyr_pa_files}",
+        "--max-prompt-adapters",
+        "2",
+        "--max-prompt-adapter-token",
+        "128",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@@ -60,11 +96,14 @@ def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras
+    # first test base model, then test loras, then test prompt adapters
-    "model_name",
+    "model_name,num_virtual_tokens",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
+     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
+     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
 )
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
+                                 num_virtual_tokens: int):
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
@@ -77,28 +116,58 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
    assert len(choice.text) >= 5
    assert choice.finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+        completion_tokens=5,
+        prompt_tokens=6 + num_virtual_tokens,
+        total_tokens=11 + num_virtual_tokens)
    # test using token IDs
    completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
-    assert len(completion.choices[0].text) >= 5
+    assert len(completion.choices[0].text) >= 1
+@pytest.mark.asyncio
+async def test_added_lora_tokens(client: openai.AsyncOpenAI):
+    # test using token IDs
+    completion = await client.completions.create(
+        model="zephyr-lora2",
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should appear in tokenized prompt
+    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
+@pytest.mark.asyncio
+async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should not appear in tokenized prompt
+    assert "vllm" not in completion.choices[0].text
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras
+    # first test base model, then test loras, then test prompt adapters
    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
    # test using token IDs
    completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
@@ -110,14 +179,14 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora hereafter
+    # just test 1 lora and 1 pa hereafter
    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
    # test using token IDs
    completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
@@ -133,12 +202,12 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
    # test using token IDs
    completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
@@ -154,7 +223,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                            model_name: str):
@@ -162,7 +231,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
    with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
        await client.completions.create(
-            model=MODEL_NAME,
+            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
@@ -174,7 +243,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
    with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
        stream = await client.completions.create(
-            model=MODEL_NAME,
+            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
@@ -199,7 +268,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_completion_streaming(client: openai.AsyncOpenAI,
                                    model_name: str):
@@ -233,7 +302,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                         model_name: str):
@@ -369,9 +438,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora hereafter
    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
    # test both text and token IDs
@@ -614,51 +682,3 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
            prompt="Give an example string that fits this regex",
            extra_body=dict(guided_regex=sample_regex,
                            guided_json=sample_json_schema))
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
-    base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
-    for add_special in [False, True]:
-        prompt = "This is a test prompt."
-        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
-        response = requests.post(base_url + "/tokenize",
-                                 json={
-                                     "add_special_tokens": add_special,
-                                     "model": model_name,
-                                     "prompt": prompt
-                                 })
-        response.raise_for_status()
-        assert response.json() == {
-            "tokens": tokens,
-            "count": len(tokens),
-            "max_model_len": 8192
-        }
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
-    base_url = str(client.base_url)[:-3]
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
-    prompt = "This is a test prompt."
-    tokens = tokenizer.encode(prompt, add_special_tokens=False)
-    response = requests.post(base_url + "detokenize",
-                             json={
-                                 "model": model_name,
-                                 "tokens": tokens
-                             })
-    response.raise_for_status()
-    assert response.json() == {"prompt": prompt}
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -11,17 +11,17 @@ EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
 @pytest.fixture(scope="module")
 def embedding_server():
-    with RemoteOpenAIServer([
+    args = [
-            "--model",
+        # use half precision for speed and memory savings in CI environment
-            EMBEDDING_MODEL_NAME,
+        "--dtype",
-            # use half precision for speed and memory savings in CI environment
+        "bfloat16",
-            "--dtype",
+        "--enforce-eager",
-            "bfloat16",
+        "--max-model-len",
-            "--enforce-eager",
+        "8192",
-            "--max-model-len",
+        "--enforce-eager",
-            "8192",
+    ]
-            "--enforce-eager",
-    ]) as remote_server:
+    with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -19,27 +19,27 @@ def zephyr_lora_files():
 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):
-    with RemoteOpenAIServer([
+    args = [
-            "--model",
+        # use half precision for speed and memory savings in CI environment
-            MODEL_NAME,
+        "--dtype",
-            # use half precision for speed and memory savings in CI environment
+        "bfloat16",
-            "--dtype",
+        "--max-model-len",
-            "bfloat16",
+        "8192",
-            "--max-model-len",
+        "--enforce-eager",
-            "8192",
+        # lora config below
-            "--enforce-eager",
+        "--enable-lora",
-            # lora config below
+        "--lora-modules",
-            "--enable-lora",
+        f"zephyr-lora={zephyr_lora_files}",
-            "--lora-modules",
+        f"zephyr-lora2={zephyr_lora_files}",
-            f"zephyr-lora={zephyr_lora_files}",
+        "--max-lora-rank",
-            f"zephyr-lora2={zephyr_lora_files}",
+        "64",
-            "--max-lora-rank",
+        "--max-cpu-loras",
-            "64",
+        "2",
-            "--max-cpu-loras",
+        "--max-num-seqs",
-            "2",
+        "128",
-            "--max-num-seqs",
+    ]
-            "128",
-    ]) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -32,11 +32,13 @@ async def _async_serving_chat_init():
                                           model_config,
                                           served_model_names=[MODEL_NAME],
                                           response_role="assistant",
-                                           chat_template=CHAT_TEMPLATE)
+                                           chat_template=CHAT_TEMPLATE,
+                                           lora_modules=None,
+                                           prompt_adapters=None,
+                                           request_logger=None)
    return serving_completion
 def test_async_serving_chat_init():
    serving_completion = asyncio.run(_async_serving_chat_init())
-    assert serving_completion.tokenizer is not None
+    assert serving_completion.chat_template == CHAT_TEMPLATE
-    assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
+import openai  # use the official client for correctness check
+import pytest
+import requests
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401
+# any model with a chat template should work here
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+@pytest.fixture(scope="module")
+def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "128",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
+        "--max-lora-rank",
+        "64",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+@pytest.fixture(scope="module")
+def tokenizer_name(model_name: str,
+                   zephyr_lora_added_tokens_files: str):  # noqa: F811
+    return zephyr_lora_added_tokens_files if (
+        model_name == "zephyr-lora2") else model_name
+@pytest.fixture(scope="module")
+def client(server):
+    return server.get_async_client()
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenize_completions(client: openai.AsyncOpenAI,
+                                    model_name: str, tokenizer_name: str):
+    base_url = str(client.base_url)[:-3].strip("/")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+    for add_special in [False, True]:
+        prompt = "vllm1 This is a test prompt."
+        tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
+        response = requests.post(base_url + "/tokenize",
+                                 json={
+                                     "add_special_tokens": add_special,
+                                     "model": model_name,
+                                     "prompt": prompt
+                                 })
+        response.raise_for_status()
+        assert response.json() == {
+            "tokens": tokens,
+            "count": len(tokens),
+            "max_model_len": 8192
+        }
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
+                             tokenizer_name: str):
+    base_url = str(client.base_url)[:-3].strip("/")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+    for add_generation in [False, True]:
+        for add_special in [False, True]:
+            conversation = [{
+                "role": "user",
+                "content": "Hi there!"
+            }, {
+                "role": "assistant",
+                "content": "Nice to meet you!"
+            }, {
+                "role": "user",
+                "content": "Can I ask a question? vllm1"
+            }]
+            prompt = tokenizer.apply_chat_template(
+                add_generation_prompt=add_generation,
+                conversation=conversation,
+                tokenize=False)
+            tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
+            response = requests.post(base_url + "/tokenize",
+                                     json={
+                                         "add_generation_prompt":
+                                         add_generation,
+                                         "add_special_tokens": add_special,
+                                         "messages": conversation,
+                                         "model": model_name
+                                     })
+            response.raise_for_status()
+            assert response.json() == {
+                "tokens": tokens,
+                "count": len(tokens),
+                "max_model_len": 8192
+            }
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
+                          tokenizer_name: str):
+    base_url = str(client.base_url)[:-3].strip("/")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")
+    prompt = "This is a test prompt. vllm1"
+    tokens = tokenizer.encode(prompt, add_special_tokens=False)
+    print(f"CALLING {base_url} FOR {model_name}")
+    response = requests.post(base_url + "/detokenize",
+                             json={
+                                 "model": model_name,
+                                 "tokens": tokens
+                             })
+    response.raise_for_status()
+    assert response.json() == {"prompt": prompt}
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -2,9 +2,8 @@ from typing import Dict, List
 import openai
 import pytest
-import pytest_asyncio
-from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
+from vllm.multimodal.utils import encode_image_base64, fetch_image
 from ...utils import VLLM_PATH, RemoteOpenAIServer
@@ -23,17 +22,17 @@ TEST_IMAGE_URLS = [
 @pytest.fixture(scope="module")
 def server():
-    with RemoteOpenAIServer([
+    args = [
-            "--model",
+        "--dtype",
-            MODEL_NAME,
+        "bfloat16",
-            "--dtype",
+        "--max-model-len",
-            "bfloat16",
+        "4096",
-            "--max-model-len",
+        "--enforce-eager",
-            "4096",
+        "--chat-template",
-            "--enforce-eager",
+        str(LLAVA_CHAT_TEMPLATE),
-            "--chat-template",
+    ]
-            str(LLAVA_CHAT_TEMPLATE),
-    ]) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
@@ -42,11 +41,10 @@ def client(server):
    return server.get_async_client()
-@pytest_asyncio.fixture(scope="session")
+@pytest.fixture(scope="session")
-async def base64_encoded_image() -> Dict[str, str]:
+def base64_encoded_image() -> Dict[str, str]:
    return {
-        image_url:
+        image_url: encode_image_base64(fetch_image(image_url))
-        encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
        for image_url in TEST_IMAGE_URLS
    }

--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
+from typing import Optional, Tuple, Union
+import torch
+def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
+    return torch.as_tensor(x, dtype=torch.float32, device='cuda')
+def ref_dynamic_per_token_quant(x: torch.tensor,
+                                quant_dtype: torch.dtype,
+                                scale_ub: Optional[torch.tensor] = None) \
+        -> Tuple[torch.tensor, torch.tensor]:
+    assert quant_dtype in [torch.int8, torch.float8_e4m3fn]
+    if scale_ub is not None:
+        assert quant_dtype == torch.float8_e4m3fn
+    qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \
+            else torch.finfo(quant_dtype)
+    qtype_max = as_float32_tensor(qtype_traits.max)
+    s_1 = as_float32_tensor(1.0)
+    s_512 = as_float32_tensor(512.0)
+    # For fp8, in order to match the cuda kernel output, we have to do exactly
+    # the same operations as in the corresponding fp8 kernel to prevent
+    # rounding errors.
+    # Compute scales
+    x_token_max, _ = x.abs().max(dim=-1)
+    x_token_max = as_float32_tensor(x_token_max)
+    if scale_ub is not None:
+        x_token_max = x_token_max.clamp(max=scale_ub)
+    scales = (x_token_max / qtype_max)[:, None]
+    # Quant
+    if quant_dtype == torch.int8:
+        iscales = as_float32_tensor(s_1 / scales)
+        torch_out = as_float32_tensor(x) * iscales
+        torch_out = torch_out.round()
+        torch_out = torch_out.clamp(qtype_traits.min,
+                                    qtype_traits.max).to(quant_dtype)
+    else:
+        assert quant_dtype == torch.float8_e4m3fn
+        min_scaling_factor = s_1 / (qtype_max * s_512)
+        scales = scales.clamp(min=min_scaling_factor)
+        torch_out = as_float32_tensor(x) / scales
+        torch_out = torch_out.clamp(qtype_traits.min,
+                                    qtype_traits.max).to(quant_dtype)
+    return torch_out, scales
+# The int8 version is very similar. Incorporate the int8 version, like in
+# ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
+# kernel
+def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
+                    -> Tuple[torch.tensor, torch.tensor]:
+    fp8_traits = torch.finfo(torch.float8_e4m3fn)
+    fp8_max = as_float32_tensor(fp8_traits.max)
+    one = as_float32_tensor(1.0)
+    # For fp8, in order to match the cuda kernel output, we have to do exactly
+    # the same operations as in the corresponding fp8 kernel to prevent
+    # rounding errors.
+    x_max = as_float32_tensor(x.abs().max())
+    ref_scale = x_max / fp8_max
+    ref_iscale = one / ref_scale
+    ref_out = (as_float32_tensor(x) * ref_iscale).clamp(
+        fp8_traits.min, fp8_traits.max).to(dtype=torch.float8_e4m3fn)
+    return ref_out, ref_scale
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@@ -176,7 +176,7 @@ def test_paged_attention(
    key_cache, value_cache = key_caches[0], value_caches[0]
    # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
    # Call the paged attention kernel.
    output = torch.empty_like(query)
@@ -194,7 +194,8 @@ def test_paged_attention(
            max_seq_len,
            alibi_slopes,
            kv_cache_dtype,
-            kv_scale,
+            k_scale,
+            v_scale,
        )
    elif version == "v2":
        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
@@ -225,7 +226,8 @@ def test_paged_attention(
            max_seq_len,
            alibi_slopes,
            kv_cache_dtype,
-            kv_scale,
+            k_scale,
+            v_scale,
        )
    else:
        raise AssertionError(f"Unknown version: {version}")

--- a/tests/kernels/test_blocksparse_attention.py
+++ b/tests/kernels/test_blocksparse_attention.py
@@ -212,7 +212,7 @@ def test_paged_attention(
    key_cache, value_cache = key_caches[0], value_caches[0]
    # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
    tp_rank = 0
    # Call the paged attention kernel.
@@ -231,7 +231,8 @@ def test_paged_attention(
            max_seq_len,
            alibi_slopes,
            kv_cache_dtype,
-            kv_scale,
+            k_scale,
+            v_scale,
            tp_rank=tp_rank,
            blocksparse_local_blocks=blocksparse_local_blocks,
            blocksparse_vert_stride=blocksparse_vert_stride,
@@ -267,7 +268,8 @@ def test_paged_attention(
            max_seq_len,
            alibi_slopes,
            kv_cache_dtype,
-            kv_scale,
+            k_scale,
+            v_scale,
            tp_rank=tp_rank,
            blocksparse_local_blocks=blocksparse_local_blocks,
            blocksparse_vert_stride=blocksparse_vert_stride,

--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@@ -156,11 +156,11 @@ def test_reshape_and_cache(
        cloned_value_cache = value_cache.clone()
    # Using default kv_scale
-    kv_scale = 1.0
+    k_scale = v_scale = 1.0
    # Call the reshape_and_cache kernel.
    ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
-                          kv_cache_dtype, kv_scale)
+                          kv_cache_dtype, k_scale, v_scale)
    if kv_cache_dtype == "fp8":
        result_key_cache = torch.empty_like(key_cache, dtype=torch.float16)

--- a/tests/kernels/test_fp8_quant.py
+++ b/tests/kernels/test_fp8_quant.py
+import pytest
+import torch
+import vllm._custom_ops as ops
+from tests.kernels.quant_utils import (ref_dynamic_per_tensor_fp8_quant,
+                                       ref_dynamic_per_token_quant)
+DTYPES = [torch.half, torch.bfloat16, torch.float]
+HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192,
+                8193]  # Arbitrary values for testing
+HIDDEN_SIZES += list(range(1024, 1033))  # vectorized conversion edge cases
+NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
+SCALE_UBS = [True, False]
+SEEDS = [0]
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("scale_ub", SCALE_UBS)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int,
+                                     dtype: torch.dtype, scale_ub: bool,
+                                     seed: int) -> None:
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype,
+                   device="cuda") + 1e-6  # avoid nans
+    scale_ub = torch.mean(x).to(dtype=torch.float32, device='cuda') \
+            if scale_ub else None
+    ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.float8_e4m3fn,
+                                                      scale_ub)
+    ops_out, ops_scales = ops.scaled_fp8_quant(x,
+                                               scale_ub=scale_ub,
+                                               use_per_token_if_dynamic=True)
+    assert torch.allclose(ref_scales, ops_scales)
+    assert torch.allclose(ref_out.to(dtype=torch.float32),
+                          ops_out.to(dtype=torch.float32))
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int,
+                                      dtype: torch.dtype, seed: int) -> None:
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    ref_out, ref_scale = ref_dynamic_per_tensor_fp8_quant(x)
+    ops_out, ops_scale = ops.scaled_fp8_quant(x)
+    assert torch.allclose(ref_scale, ops_scale)
+    assert torch.allclose(ref_out.to(dtype=torch.float32),
+                          ops_out.to(dtype=torch.float32))
+# Regression test for a case with large activations where an int32 index cannot
+# represent the number of elements.
+@torch.inference_mode()
+@pytest.mark.parametrize("seed", SEEDS)
+def test_fp8_quant_large(seed: int) -> None:
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
+    hidden_size = 1152  # Smallest hidden_size to reproduce the error
+    dtype = torch.bfloat16
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    ref_out, scale = ref_dynamic_per_tensor_fp8_quant(x)
+    ops_out, _ = ops.scaled_fp8_quant(x, scale)
+    # Minimize memory footprint in this test by freeing x and upconverting
+    # the outputs in place. (torch.allclose does not support fp8)
+    del x
+    ref_out = ref_out.to(dtype=dtype)
+    ops_out = ops_out.to(dtype=dtype)
+    assert torch.allclose(ref_out, ops_out)
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -3,6 +3,8 @@ import torch
 # ruff: noqa: F401
 import vllm._C
+from tests.kernels.quant_utils import ref_dynamic_per_token_quant
+from vllm._custom_ops import scaled_int8_quant
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
@@ -21,23 +23,16 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
                                   dtype: torch.dtype, seed: int) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
-    int8_traits = torch.iinfo(torch.int8)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
-    x_token_max, _ = x.max(dim=1)
+    # reference
-    x_token_max = x_token_max.to(dtype=torch.float32)
+    ref_out, ref_scales = ref_dynamic_per_token_quant(x, torch.int8)
-    scales = (x_token_max / float(127.0))[:, None].to(device="cuda",
+    # kernel
-                                                      dtype=torch.float32)
+    ops_out, ops_scales = scaled_int8_quant(x)
-    torch_out = (x / scales).round().clamp(int8_traits.min,
-                                           int8_traits.max).to(torch.int8)
-    ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
-    scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
-    torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out)
-    assert torch.allclose(scales_out, scales)
+    assert torch.allclose(ops_scales, ref_scales)
-    assert torch.allclose(torch_out, ops_out,
+    assert torch.allclose(ops_out, ref_out,
                          atol=1)  # big atol to account for rounding errors
@@ -55,12 +50,11 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
    int8_traits = torch.iinfo(torch.int8)
    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
+    scale = torch.tensor([scale], dtype=torch.float32, device="cuda")
    out1 = (x / scale).round().clamp(int8_traits.min,
                                     int8_traits.max).to(torch.int8)
-    out2 = torch.empty_like(x, dtype=torch.int8)
+    out2, _ = scaled_int8_quant(x, scale)
-    scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
-    torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument)
    assert torch.allclose(out1, out2,
                          atol=1)  # big atol to account for rounding errors
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -12,17 +12,18 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS,
+    MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_NUM_BITS,
-    marlin_permute_scales)
+    marlin_make_empty_g_idx, marlin_permute_scales)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
    pack_fp8_to_int32)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
-    MarlinWorkspace, get_weight_perm, marlin_quantize, marlin_weights)
+    MarlinWorkspace, awq_marlin_quantize, get_weight_perm, marlin_quantize,
+    marlin_weights)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
    marlin_24_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, quantize_weights, sort_weights)
+    awq_pack, gptq_pack, quantize_weights, quantize_weights_with_zp,
-from vllm.utils import is_hip
+    sort_weights)
 ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]
@@ -58,12 +59,12 @@ def rand_data(shape, dtype=torch.float16):
                    reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
-@pytest.mark.parametrize("num_bits", GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+@pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS)
-@pytest.mark.parametrize("group_size", GPTQ_MARLIN_SUPPORTED_GROUP_SIZES)
+@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
+def test_gptq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
-                       mnk_factors):
+                            mnk_factors):
    m_factor, n_factor, k_factor = mnk_factors
    size_m = m_factor
@@ -121,12 +122,60 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
                    reason="Marlin is not supported on this GPU type.")
 @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
 @pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
-@pytest.mark.parametrize("num_bits", GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+@pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS)
-@pytest.mark.parametrize("group_size", GPTQ_MARLIN_SUPPORTED_GROUP_SIZES)
+@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size,
+                           mnk_factors):
+    m_factor, n_factor, k_factor = mnk_factors
+    size_m = m_factor
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
+    print(f"MNK = {size_m} {size_n} {size_k}")
+    # Normalize group_size
+    if group_size == -1:
+        group_size = size_k
+    assert group_size <= size_k
+    # Create input
+    b_weight = rand_data((size_k, size_n))
+    # Quantize
+    w_ref, q_w, s, zp = quantize_weights_with_zp(b_weight, num_bits,
+                                                 group_size)
+    # Pack to AWQ format
+    q_w_awq = awq_pack(q_w, num_bits, size_k, size_n)
+    # Pack to Marlin format
+    weight_perm = get_weight_perm(num_bits)
+    marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
+    # Run Marlin repack GPU kernel
+    marlin_q_w_2 = ops.awq_marlin_repack(
+        q_w_awq,
+        size_k,
+        size_n,
+        num_bits,
+    )
+    torch.cuda.synchronize()
+    assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
+@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
+@pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS)
+@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
 @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS)
 @pytest.mark.parametrize("is_k_full", K_FULL_OPTS)
-def test_marlin_gemm(
+def test_gptq_marlin_gemm(
    k_chunk,
    n_chunk,
    num_bits,
@@ -156,6 +205,8 @@ def test_marlin_gemm(
    w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, _ = marlin_quantize(
        b_weight, num_bits, group_size, act_order)
+    marlin_zp = marlin_make_empty_g_idx(marlin_s.device)
    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                                GPTQ_MARLIN_MAX_PARALLEL)
@@ -163,6 +214,7 @@ def test_marlin_gemm(
        a_input,
        marlin_q_w,
        marlin_s,
+        marlin_zp,
        g_idx,
        sort_indices,
        workspace.scratch,
@@ -171,6 +223,7 @@ def test_marlin_gemm(
        b_weight.shape[1],
        a_input.shape[1],
        is_k_full,
+        has_zp=False,
    )
    output_ref = torch.matmul(a_input, w_ref)
@@ -189,7 +242,8 @@ def test_marlin_gemm(
 @pytest.mark.parametrize("num_bits", GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
 @pytest.mark.parametrize("group_size", GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES)
 @pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
-def test_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size, mnk_factors):
+def test_gptq_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size,
+                             mnk_factors):
    m_factor, n_factor, k_factor = mnk_factors
    size_m = m_factor
@@ -302,3 +356,65 @@ def test_fp8_marlin_gemm(
    print("max_diff = {}".format(max_diff))
    assert max_diff < 0.04
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="Marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
+@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
+@pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS)
+@pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES)
+@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
+def test_awq_marlin_gemm(
+    k_chunk,
+    n_chunk,
+    num_bits,
+    group_size,
+    mnk_factors,
+):
+    m_factor, n_factor, k_factor = mnk_factors
+    size_m = m_factor
+    size_k = k_chunk * k_factor
+    size_n = n_chunk * n_factor
+    print(f"MNK = {size_m} {size_n} {size_k}")
+    print(f"groupsize = {group_size}")
+    a_input = rand_data((size_m, size_k))
+    b_weight = rand_data((size_k, size_n))
+    w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
+        b_weight, num_bits, group_size)
+    g_idx = torch.empty(0, dtype=torch.int, device=marlin_q_w.device)
+    sort_indices = torch.empty(0, dtype=torch.int, device=marlin_q_w.device)
+    is_k_full = True
+    has_zp = True
+    workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+                                GPTQ_MARLIN_MAX_PARALLEL)
+    output = ops.gptq_marlin_gemm(
+        a_input,
+        marlin_q_w,
+        marlin_s,
+        marlin_zp,
+        g_idx,
+        sort_indices,
+        workspace.scratch,
+        num_bits,
+        a_input.shape[0],
+        b_weight.shape[1],
+        a_input.shape[1],
+        is_k_full,
+        has_zp,
+    )
+    output_ref = torch.matmul(a_input, w_ref)
+    torch.cuda.synchronize()
+    max_diff = compute_max_diff(output, output_ref)
+    print("max_diff = {}".format(max_diff))
+    assert max_diff < 0.04
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -159,8 +159,14 @@ def dummy_model_gate_up() -> nn.Module:
 @pytest.fixture(scope="session")
-def sql_lora_files():
+def sql_lora_huggingface_id():
-    return snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+    # huggingface repo id is used to test lora runtime downloading.
+    return "yard1/llama-2-7b-sql-lora-test"
+@pytest.fixture(scope="session")
+def sql_lora_files(sql_lora_huggingface_id):
+    return snapshot_download(repo_id=sql_lora_huggingface_id)
 @pytest.fixture(scope="session")

--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -29,7 +29,7 @@ def _create_lora_request(lora_id, long_context_infos):
    context_len = long_context_infos[lora_id]["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
    return LoRARequest(context_len, lora_id,
-                       long_context_infos[lora_id]["lora"],
+                       long_context_infos[lora_id]["lora"], None,
                       4096 * scaling_factor)