Merge tag 'v0.8.3' into v0.8.3-dev

fcfc474d · zhuwenwen · bb94d2e5 · 296c6572 · fcfc474d · fcfc474d
Commit fcfc474d authored Apr 09, 2025 by zhuwenwen
20 changed files
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -63,7 +63,8 @@ class LlamaConfig:
            factors.append((k, v))
        factors.sort()
        import hashlib
-        return hashlib.md5(str(factors).encode()).hexdigest()
+        return hashlib.md5(str(factors).encode(),
+                           usedforsecurity=False).hexdigest()

    def __post_init__(self):
        assert self.mlp_size >= self.hidden_size

--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -2,21 +2,20 @@

 from __future__ import annotations

-from typing import Any
+from typing import Any, Union

 import pytest
 import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationLevel
+from vllm.config import CompilationConfig, CompilationLevel
 from vllm.platforms import current_platform

 from ..utils import create_new_process_for_each_test


-@pytest.fixture(params=None, name="model_info")
-def models_list_fixture(request):
+def models_list(all: bool):
    TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
        ("facebook/opt-125m", {}),
        ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
@@ -33,6 +32,9 @@ def models_list_fixture(request):
        ("meta-llama/Llama-3.2-1B-Instruct", {}),
    ]

+    if not all:
+        return TEST_MODELS
+
    if is_quant_method_supported("aqlm"):
        TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
            "quantization": "aqlm"
@@ -77,7 +79,7 @@ def models_list_fixture(request):
    "optimization_level",
    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
 )
-@pytest.mark.parametrize("model_info", "", indirect=True)
+@pytest.mark.parametrize("model_info", models_list(all=True))
 @create_new_process_for_each_test()
 def test_full_graph(
    monkeypatch: pytest.MonkeyPatch,
@@ -91,25 +93,50 @@ def test_full_graph(
        m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
        print(f"MODEL={model}")

-        prompts = [
-            "Hello, my name is",
-            "The president of the United States is",
-            "The capital of France is",
-            "The future of AI is",
-        ]
-        sampling_params = SamplingParams(temperature=0)
-        llm = LLM(
-            model=model,
-            enforce_eager=True,
-            tensor_parallel_size=1,
-            disable_custom_all_reduce=True,
-            compilation_config=optimization_level,
-            **model_kwargs,
-        )
-        outputs = llm.generate(prompts, sampling_params)
-
-        # Print the outputs.
-        for output in outputs:
-            prompt = output.prompt
-            generated_text = output.outputs[0].text
-            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        run_model(optimization_level, model, model_kwargs)
+
+
+# TODO(luka) add other supported compilation config scenarios here
+@pytest.mark.parametrize(
+    "compilation_config",
+    # additional compile sizes
+    [
+        CompilationConfig(level=CompilationLevel.PIECEWISE,
+                          compile_sizes=[1, 2])
+    ])
+# only test some of the models
+@pytest.mark.parametrize("model_info", models_list(all=False))
+@create_new_process_for_each_test()
+def test_custom_compile_config(
+    model_info: tuple[str, dict[str, Any]],
+    compilation_config: CompilationConfig,
+):
+    """Run every model from ``models_list(all=False)`` with each config.
+
+    Args:
+        model_info: ``(model, model_kwargs)`` pair from ``models_list``.
+        compilation_config: the ``CompilationConfig`` handed to ``run_model``.
+    """
+    model, model_kwargs = model_info
+    print(f"MODEL={model}")
+    run_model(compilation_config, model, model_kwargs)
+
+
+def run_model(compile_config: Union[int, CompilationConfig], model: str,
+              model_kwargs: dict[str, Any]):
+    """Build an ``LLM`` for ``model`` and generate on four fixed prompts.
+
+    Args:
+        compile_config: forwarded to ``LLM`` as ``compilation_config``;
+            either an integer or a ``CompilationConfig``.
+        model: model identifier passed to ``LLM``.
+        model_kwargs: extra keyword arguments unpacked into ``LLM``.
+
+    The outputs are only printed; nothing is asserted about their content,
+    so the call succeeds as long as construction and generation do not raise.
+    """
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0)
+    llm = LLM(
+        model=model,
+        enforce_eager=True,
+        tensor_parallel_size=1,
+        disable_custom_all_reduce=True,
+        compilation_config=compile_config,
+        **model_kwargs,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -2,7 +2,6 @@

 import pytest
 import torch
-from compressed_tensors.quantization import FP8_DTYPE

 import vllm.envs as envs
 import vllm.plugins
@@ -14,9 +13,12 @@ from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)
+from vllm.platforms import current_platform

 from .backend import TestBackend

+FP8_DTYPE = current_platform.fp8_dtype()
+

 class TestModel(torch.nn.Module):

@@ -59,8 +61,8 @@ class TestModel(torch.nn.Module):
 @pytest.mark.parametrize("static", [True, False])
 @pytest.mark.parametrize("cutlass_fp8_enabled",
                         [True, False] if CUTLASS_FP8_SUPPORTED else [False])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda",
-                    reason="Only test on CUDA")
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
+                    reason="Only test on CUDA and ROCm")
 def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
                              cutlass_fp8_enabled):
    torch.set_default_device("cuda")

--- a/tests/data/test_config.yaml
+++ b/tests/data/test_config.yaml
--- a/tests/config/test_config_with_model.yaml
+++ b/tests/config/test_config_with_model.yaml
+# Same as test_config.yaml but with model specified
+model: config-model
+port: 12312
+served_model_name: mymodel
+tensor_parallel_size: 2
+trust_remote_code: true
+multi_step_stream_outputs: false
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -752,30 +752,27 @@ class VllmRunner:
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> list[TextPrompt]:
-        if images is not None:
-            assert len(prompts) == len(images)
-
-        if videos is not None:
-            assert len(prompts) == len(videos)

-        if audios is not None:
-            assert len(prompts) == len(audios)
+        if any(x is not None and len(x) != len(prompts)
+               for x in [images, videos, audios]):
+            raise ValueError(
+                "All non-None multimodal inputs must have the same length as "
+                "prompts")

-        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
-        if images is not None:
-            for i, image in enumerate(images):
-                if image is not None:
-                    inputs[i]["multi_modal_data"] = {"image": image}
-
-        if videos is not None:
-            for i, video in enumerate(videos):
-                if video is not None:
-                    inputs[i]["multi_modal_data"] = {"video": video}
+        inputs = []
+        for i, prompt in enumerate(prompts):
+            multi_modal_data = {}
+            if images is not None and (image := images[i]) is not None:
+                multi_modal_data["image"] = image
+            if videos is not None and (video := videos[i]) is not None:
+                multi_modal_data["video"] = video
+            if audios is not None and (audio := audios[i]) is not None:
+                multi_modal_data["audio"] = audio

-        if audios is not None:
-            for i, audio in enumerate(audios):
-                if audio is not None:
-                    inputs[i]["multi_modal_data"] = {"audio": audio}
+            inputs.append(
+                TextPrompt(prompt=prompt,
+                           multi_modal_data=multi_modal_data
+                           if multi_modal_data else None))

        return inputs

@@ -1145,3 +1142,15 @@ def pytest_collection_modifyitems(config, items):
    for item in items:
        if "optional" in item.keywords:
            item.add_marker(skip_optional)
+
+
+@pytest.fixture(scope="session")
+def cli_config_file():
+    """Return the path to the CLI config file."""
+    return os.path.join(_TEST_DIR, "config", "test_config.yaml")
+
+
+@pytest.fixture(scope="session")
+def cli_config_file_with_model():
+    """Return the path to the CLI config file with model."""
+    return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -131,12 +131,16 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
    check_answers(indices, answer, test_texts)


-def prep_prompts(batch_size: int):
+def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
    """
    Generate prompts which a bunch of assignments,
    then asking for the value of one of them.
    The prompt is just under 10k tokens; sliding window is 4k
    so the answer is outside sliding window, but should still be correct.
+
+    Args:
+        batch_size: number of prompts to generate
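+        ln_range: inclusive (low, high) bounds for random.randint; the drawn
+            value is the upper limit of the assignment loop, so it controls
+            the length of the prompt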
    """
    prompts: list[str] = []
    answer: list[int] = []
@@ -147,7 +151,7 @@ def prep_prompts(batch_size: int):
        indices.append(idx)
        prompt = "```python\n# We set a number of variables, " + \
                 f"x{idx} will be important later\n"
-        ln = random.randint(800, 1100)
+        ln = random.randint(*ln_range)
        for k in range(30, ln):
            v = random.randint(10, 99)
            if k == idx:
@@ -159,7 +163,10 @@ def prep_prompts(batch_size: int):
    return prompts, answer, indices


-def check_answers(indices: list[int], answer: list[int], outputs: list[str]):
+def check_answers(indices: list[int],
+                  answer: list[int],
+                  outputs: list[str],
+                  accept_rate: float = 0.7):
    answer2 = [int(text[0:2].strip()) for text in outputs]
    print(list(zip(indices, zip(answer, answer2))))
    numok = 0
@@ -168,7 +175,7 @@ def check_answers(indices: list[int], answer: list[int], outputs: list[str]):
            numok += 1
    frac_ok = numok / len(answer)
    print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
-    assert frac_ok > 0.7
+    assert frac_ok >= accept_rate


 def check_window(prompts: list[str]):

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -106,7 +106,6 @@ def eager_allreduce(
        # communicate independently
        num_communication = rank // tp_size + 1
        sz = 1024
-        # fa = get_tp_group().ca_comm
        fa = get_tp_group().device_communicator.ca_comm
        inp = torch.ones(sz, dtype=torch.float32, device=device)
        out = inp

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -245,7 +245,7 @@ TEST_MODELS = [
    # [LANGUAGE GENERATION]
    os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-    # "ArthurZ/Ilama-3.2-1B", NOTE: Uncomment after #13905
+    os.path.join(models_path_prefix, "ArthurZ/Ilama-3.2-1B"),
    os.path.join(models_path_prefix, "ibm/PowerLM-3b"),
    # [LANGUAGE EMBEDDING]
    os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),

--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -15,18 +15,24 @@ import pytest
 from vllm.platforms import current_platform
 from ...utils import models_path_prefix

-MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
+MODEL_NAMES = [
+    os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
+    os.path.join(models_path_prefix, "google/gemma-3-1b-it"),
+]
 NUM_CONCURRENT = 500
 TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
-EXPECTED_VALUE = 0.58
+EXPECTED_VALUES = {
+    "Qwen/Qwen2-1.5B-Instruct": 0.58,
+    "google/gemma-3-1b-it": 0.25,
+}


-def run_test(more_args=None):
+def run_test(model_name, more_args=None):
    """Run the end to end accuracy test."""

-    model_args = f"pretrained={MODEL_NAME},max_model_len=4096"
+    model_args = f"pretrained={model_name},max_model_len=4096"

    if more_args is not None:
        model_args = "{},{}".format(model_args, more_args)
@@ -39,9 +45,12 @@ def run_test(more_args=None):
    )

    measured_value = results["results"][TASK][FILTER]
-    assert (measured_value - RTOL < EXPECTED_VALUE
-            and measured_value + RTOL > EXPECTED_VALUE
-            ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
+    assert model_name in EXPECTED_VALUES, (
+        f"Cannot find the expected value for the model {model_name=}")
+    expected_value = EXPECTED_VALUES[model_name]
+    assert (measured_value - RTOL < expected_value
+            and measured_value + RTOL > expected_value
+            ), f"Expected: {expected_value} |  Measured: {measured_value}"


 # TODO: [AlexM] Fix it with new CI/CD tests
@@ -51,7 +60,8 @@ TPU_TP_TEST_STR = ""  #"tensor_parallel_size=4"
 @pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
+@pytest.mark.parametrize("model", MODEL_NAMES)
+def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
    """Run with the V1 Engine."""

    with monkeypatch.context() as m:
@@ -60,13 +70,13 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
        more_args = None
        if current_platform.is_tpu():
            # Limit compilation time for TPU V1
-            more_args = "max_num_seqs=64"
+            more_args = "max_model_len=2048,max_num_seqs=64"

            # Add TP test (if provided)
            if TPU_TP_TEST_STR:
                more_args += ",{}".format(TPU_TP_TEST_STR)

-        run_test(more_args)
+        run_test(model, more_args)


 def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
@@ -74,4 +84,4 @@ def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
-        run_test()
+        run_test("Qwen/Qwen2-1.5B-Instruct")
--- a/tests/entrypoints/llm/test_generate_multiple_loras.py
+++ b/tests/entrypoints/llm/test_generate_multiple_loras.py
@@ -25,7 +25,19 @@ LORA_NAME = os.path.join(models_path_prefix, "typeof/zephyr-7b-beta-lora")


 @pytest.fixture(scope="module")
-def llm():
+def monkeypatch_module():
+    """Yield a fresh ``MonkeyPatch`` and undo its patches on teardown."""
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    # runs after the consumer is finished: revert everything set via mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def llm(request, monkeypatch_module):
+
+    use_v1 = request.param
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -7,7 +7,6 @@ import weakref
 import jsonschema
 import pytest
 import os
-from pydantic import BaseModel

 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
@@ -18,7 +17,10 @@ from ...utils import models_path_prefix

 MODEL_NAME = os.path.join(models_path_prefix, "Qwen2.5-1.5B-Instruct")
 GUIDED_DECODING_BACKENDS = [
-    "outlines", "lm-format-enforcer", "xgrammar", "guidance"
+    "outlines",
+    "lm-format-enforcer",
+    "xgrammar:disable-any-whitespace",
+    "guidance:disable-any-whitespace",
 ]


@@ -325,59 +327,9 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
            print(generated_text)
            assert generated_text is not None

+            if 'disable-any-whitespace' in guided_decoding_backend:
+                assert "\n" not in generated_text
+
            # Parse to verify it is valid JSON
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)
-
-
-@pytest.mark.skip_global_cleanup
-def test_json_with_any_whitespace_disabled(llm):
-
-    class ResponseSchema(BaseModel):
-        clarifying_question: str
-        cost_per_serving: str
-        calories: str
-        type_dish_ids: str
-        type_meal_ids: str
-        product_ids: list[str]
-        exclude_product_ids: list[str]
-        allergen_ids: list[str]
-        total_cooking_time: str
-        kitchen_ids: str
-        holiday_ids: str
-
-    # Note: Without this setting, the response is sometimes full of `\n`
-    # for some models. This option prevents that.
-    guided_decoding_backend = 'xgrammar:disable-any-whitespace'
-
-    schema = ResponseSchema.model_json_schema()
-    guided_params = GuidedDecodingParams(json=schema,
-                                         backend=\
-                                           guided_decoding_backend)
-    sampling_params = SamplingParams(max_tokens=2000,
-                                     frequency_penalty=0,
-                                     presence_penalty=-1.1,
-                                     repetition_penalty=1.3,
-                                     guided_decoding=guided_params)
-
-    prompt = ("<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You"
-              "are a helpful assistant.<|im_end|>\n<|im_start|>user\nI want a "
-              "quick launch fast with $10.<|im_end|>\n<|im_start|>assistant\n")
-    outputs = llm.generate(prompts=prompt,
-                           sampling_params=sampling_params,
-                           use_tqdm=True)
-
-    assert outputs is not None
-
-    for output in outputs:
-        assert output is not None
-        assert isinstance(output, RequestOutput)
-
-        generated_text = output.outputs[0].text
-        assert generated_text is not None
-        assert "\n" not in generated_text
-
-        # Parse to verify it is valid JSON
-        parsed_json = json.loads(generated_text)
-        assert isinstance(parsed_json, dict)
-        jsonschema.validate(instance=parsed_json, schema=schema)
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -12,7 +12,7 @@ import os
 import pytest_asyncio
 import requests
 import torch
-from openai import BadRequestError
+from openai import BadRequestError, OpenAI

 from ...utils import RemoteOpenAIServer, models_path_prefix
 from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
@@ -25,7 +25,23 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]


 @pytest.fixture(scope="module")
-def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
+def monkeypatch_module():
+    """Yield a fresh ``MonkeyPatch`` and undo its patches on teardown."""
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    # runs after the consumer is finished: revert everything set via mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def server(
+        request,
+        monkeypatch_module,
+        zephyr_lora_files,  #noqa: F811
+        zephyr_lora_added_tokens_files):  # noqa: F811
+
+    use_v1 = request.param
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@@ -50,6 +66,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
        yield remote_server


+@pytest.fixture
+def is_v1_server(server):
+    """Return True when the ``VLLM_USE_V1`` environment variable is ``'1'``.
+
+    Depends on ``server`` so that fixture has already set the variable
+    before it is read here; the value must be ``'0'`` or ``'1'``.
+    """
+    import os
+    assert os.environ['VLLM_USE_V1'] in ['0', '1']
+    return os.environ['VLLM_USE_V1'] == '1'
+
+
 @pytest_asyncio.fixture
 async def client(server):
    async with server.get_async_client() as async_client:
@@ -476,8 +499,13 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
+                                  is_v1_server: bool,
                                  guided_decoding_backend: str,
                                  sample_guided_choice):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -516,9 +544,13 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI,
+async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
                                guided_decoding_backend: str,
                                sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported in V1")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -564,7 +596,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_chat(client: openai.AsyncOpenAI,
+                                 is_v1_server: bool,
                                 guided_decoding_backend: str, sample_regex):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -622,8 +659,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
+                                           is_v1_server: bool,
                                           guided_decoding_backend: str,
                                           sample_guided_choice):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -653,9 +695,13 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI,
+async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                              guided_decoding_backend: str,
                              sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -745,53 +791,140 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
-                                                   sample_json_schema):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role":
-        "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {sample_json_schema}"
-    }]
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_required_tool_use(client: openai.AsyncOpenAI,
+                                 is_v1_server: bool, model_name: str):
+    if is_v1_server:
+        pytest.skip(
+            "tool_choice='required' requires features unsupported on V1")
+
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description":
+                            "The city to find the weather for, e.g. 'Vienna'",
+                            "default": "Vienna",
+                        },
+                        "country": {
+                            "type":
+                            "string",
+                            "description":
+                            "The country that the city is in, e.g. 'Austria'",
+                        },
+                        "unit": {
+                            "type": "string",
+                            "description":
+                            "The unit to fetch the temperature in",
+                            "enum": ["celsius", "fahrenheit"],
+                        },
+                    },
+                    "required": ["country", "unit"],
+                },
+            },
+        },
+        {
+            "type": "function",
+            "function": {
+                "name": "get_forecast",
+                "description": "Get the weather forecast for a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description":
+                            "The city to get the forecast for, e.g. 'Vienna'",
+                            "default": "Vienna",
+                        },
+                        "country": {
+                            "type":
+                            "string",
+                            "description":
+                            "The country that the city is in, e.g. 'Austria'",
+                        },
+                        "days": {
+                            "type":
+                            "integer",
+                            "description":
+                            "Number of days to get the forecast for (1-7)",
+                        },
+                        "unit": {
+                            "type": "string",
+                            "description":
+                            "The unit to fetch the temperature in",
+                            "enum": ["celsius", "fahrenheit"],
+                        },
+                    },
+                    "required": ["country", "days", "unit"],
+                },
+            },
+        },
+    ]

-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_completion_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": sample_json_schema
-                }
-            }],
-            tool_choice="required")
+    messages = [
+        {
+            "role": "user",
+            "content": "Hi! How are you doing today?"
+        },
+        {
+            "role": "assistant",
+            "content": "I'm doing well! How can I help you?"
+        },
+        {
+            "role":
+            "user",
+            "content":
+            "Can you tell me what the current weather is in Berlin and the "\
+            "forecast for the next 5 days, in fahrenheit?",
+        },
+    ]

-    with pytest.raises(openai.BadRequestError):
-        await client.chat.completions.create(
-            model=MODEL_NAME,
-            messages=messages,
-            max_completion_tokens=1000,
-            tools=[{
-                "type": "function",
-                "function": {
-                    "name": "dummy_function_name",
-                    "description": "This is a dummy function",
-                    "parameters": sample_json_schema
-                }
-            }],
-            tool_choice="auto")
+    # Non-streaming test
+    chat_completion = await client.chat.completions.create(
+        messages=messages,
+        model=model_name,
+        tools=tools,
+        tool_choice="required",
+        extra_body=dict(guided_decoding_backend="outlines"),
+    )
+
+    assert chat_completion.choices[0].message.tool_calls is not None
+    assert len(chat_completion.choices[0].message.tool_calls) > 0
+
+    # Streaming test
+    stream = await client.chat.completions.create(
+        messages=messages,
+        model=model_name,
+        tools=tools,
+        tool_choice="required",
+        extra_body=dict(guided_decoding_backend="outlines"),
+        stream=True,
+    )
+
+    output = []
+    async for chunk in stream:
+        if chunk.choices and chunk.choices[0].delta.tool_calls:
+            output.extend(chunk.choices[0].delta.tool_calls)
+
+    assert len(output) > 0


 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
+                                                  is_v1_server: bool,
                                                  sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -1005,7 +1138,7 @@ async def test_long_seed(client: openai.AsyncOpenAI):


 @pytest.mark.asyncio
-async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
+async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
    url = f"http://localhost:{server.port}/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
@@ -1026,10 +1159,35 @@ async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
    response = requests.post(url, headers=headers, json=data)
    response_data = response.json()
    print(response_data)
-
+    assert response_data.get("model") == MODEL_NAME
    choice = response_data.get("choices")[0]
    message = choice.get("message")
    assert message is not None
    content = message.get("content")
    assert content is not None
    assert len(content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME, ""])
+async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer,
+                                                   model_name: str):
+    """Check the reported model for an explicit and an empty ``model`` field.
+
+    Sends one chat request through the synchronous ``OpenAI`` client with
+    ``model`` set to the parametrized ``model_name`` and asserts that the
+    response names ``MODEL_NAME`` in both cases.
+    """
+
+    openai_api_key = "EMPTY"
+    openai_api_base = f"http://localhost:{server.port}/v1"
+
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    messages = [
+        {
+            "role": "user",
+            "content": "Hello, vLLM!"
+        },
+    ]
+    response = client.chat.completions.create(
+        # Pass the parametrized value: hard-coding "" here made both
+        # parametrized cases send the identical request.
+        model=model_name,
+        messages=messages,
+    )
+    assert response.model == MODEL_NAME
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -53,7 +53,20 @@ def zephyr_lora_files():


 @pytest.fixture(scope="module")
-def server_with_lora_modules_json(zephyr_lora_files):
+def monkeypatch_module():
+    """Yield a ``MonkeyPatch`` instance and undo its patches afterwards.
+
+    The instance is created by hand so that fixtures with a wider scope than
+    a single test function can request it; ``undo()`` runs once the consumer
+    of the fixture is torn down.
+    """
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def server_with_lora_modules_json(request, monkeypatch_module,
+                                  zephyr_lora_files):
+
+    use_v1 = request.param
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+
    # Define the json format LoRA module configurations
    lora_module_1 = {
        "name": "zephyr-lora",

--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -14,9 +14,12 @@ import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer

+from vllm import version
+
 from ...utils import RemoteOpenAIServer, models_path_prefix

 MODEL_NAME = os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+PREV_MINOR_VERSION = version._prev_minor_version()


 @pytest.fixture(scope="module", params=[True, False])
@@ -56,6 +59,7 @@ def default_server_args():
                    "",
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
+                    f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
                ])
 def server(use_v1, default_server_args, request):
    if request.param:
@@ -130,7 +134,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,

    # Loop over all expected metric_families
    for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+        if ((use_v1 and metric_family not in EXPECTED_METRICS_V1)
+                or (not server.show_hidden_metrics
+                    and metric_family in HIDDEN_DEPRECATED_METRICS)):
            continue

        found_metric = False
@@ -166,10 +172,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer,

 EXPECTED_METRICS = [
    "vllm:num_requests_running",
-    "vllm:num_requests_swapped",
+    "vllm:num_requests_swapped",  # deprecated
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
-    "vllm:cpu_cache_usage_perc",
+    "vllm:cpu_cache_usage_perc",  # deprecated
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
@@ -269,6 +275,11 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_decode_time_seconds_count",
 ]

+HIDDEN_DEPRECATED_METRICS = [
+    "vllm:num_requests_swapped",
+    "vllm:cpu_cache_usage_perc",
+]
+

 @pytest.mark.asyncio
 async def test_metrics_exist(server: RemoteOpenAIServer,
@@ -283,7 +294,9 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
    assert response.status_code == HTTPStatus.OK

    for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        assert metric in response.text
+        # Only skip a metric when it is a hidden deprecated one AND hidden
+        # metrics are not being shown; everything else must be present.
+        # (Same rule as the skip condition in test_metrics_counts.)
+        if (server.show_hidden_metrics
+                or metric not in HIDDEN_DEPRECATED_METRICS):
+            assert metric in response.text


 def test_metrics_exist_run_batch(use_v1: bool):

--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -25,15 +25,37 @@ def test_sleep_mode():
                                "VLLM_SERVER_DEV_MODE": "1",
                                "CUDA_VISIBLE_DEVICES": "0"
                            }) as remote_server:
-        response = requests.post(remote_server.url_for("/sleep"),
-                                 data={"level": "1"})
+        response = requests.post(remote_server.url_for("sleep"),
+                                 params={"level": "1"})
        assert response.status_code == 200
-        response = requests.get(remote_server.url_for("/is_sleeping"))
+        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is True

-        response = requests.post(remote_server.url_for("/wake_up"))
+        response = requests.post(remote_server.url_for("wake_up"))
        assert response.status_code == 200
-        response = requests.get(remote_server.url_for("/is_sleeping"))
+        response = requests.get(remote_server.url_for("is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is False
+
+        # test wake up with tags
+        response = requests.post(remote_server.url_for("sleep"),
+                                 params={"level": "1"})
+        assert response.status_code == 200
+
+        response = requests.post(remote_server.url_for("wake_up"),
+                                 params={"tags": ["weights"]})
+        assert response.status_code == 200
+
+        # is_sleeping should remain true while only part of the engine is awake
+        response = requests.get(remote_server.url_for("is_sleeping"))
+        assert response.status_code == 200
+        assert response.json().get("is_sleeping") is True
+
+        response = requests.post(remote_server.url_for("wake_up"),
+                                 params={"tags": ["kv_cache"]})
+        assert response.status_code == 200
+
+        response = requests.get(remote_server.url_for("is_sleeping"))
        assert response.status_code == 200
        assert response.json().get("is_sleeping") is False
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -4,6 +4,9 @@ import openai
 import pytest
 import os
 import pytest_asyncio
+import requests
+from PIL import Image
+from transformers import AutoProcessor

 from vllm.multimodal.utils import encode_image_base64, fetch_image

@@ -62,11 +65,31 @@ def base64_encoded_image() -> dict[str, str]:
    }


+def get_hf_prompt_tokens(model_name, content, image_url):
+    """Return the prompt length the HF processor produces for image + text.
+
+    A single user message (image placeholder followed by ``content``) is
+    rendered with the tokenizer's chat template, then passed to the
+    processor together with the image downloaded from ``image_url``. The
+    result is the size of dimension 1 of the returned ``input_ids``.
+    """
+    hf_processor = AutoProcessor.from_pretrained(model_name,
+                                                 trust_remote_code=True,
+                                                 num_crops=4)
+
+    image_tag = "<|image_1|>\n"
+    user_turn = {
+        "role": "user",
+        "content": f"{image_tag}{content}",
+    }
+    http_response = requests.get(image_url, stream=True)
+    pil_images = [Image.open(http_response.raw)]
+
+    rendered_prompt = hf_processor.tokenizer.apply_chat_template(
+        [user_turn], tokenize=False, add_generation_prompt=True)
+    model_inputs = hf_processor(rendered_prompt,
+                                pil_images,
+                                return_tensors="pt")
+
+    return model_inputs.input_ids.shape[1]
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
+    content_text = "What's in this image?"
    messages = [{
        "role":
        "user",
@@ -79,16 +102,17 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
            },
            {
                "type": "text",
-                "text": "What's in this image?"
+                "text": content_text
            },
        ],
    }]

+    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5)
@@ -96,8 +120,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text,
+                                            image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=774, total_tokens=784)
+        completion_tokens=max_completion_tokens,
+        prompt_tokens=hf_prompt_tokens,
+        total_tokens=hf_prompt_tokens + max_completion_tokens)

    message = choice.message
    message = chat_completion.choices[0].message
@@ -159,6 +187,7 @@ async def test_single_chat_session_image_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, image_url: str,
        base64_encoded_image: dict[str, str]):

+    content_text = "What's in this image?"
    messages = [{
        "role":
        "user",
@@ -172,16 +201,17 @@ async def test_single_chat_session_image_base64encoded(
            },
            {
                "type": "text",
-                "text": "What's in this image?"
+                "text": content_text
            },
        ],
    }]

+    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5)
@@ -189,8 +219,12 @@ async def test_single_chat_session_image_base64encoded(

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text,
+                                            image_url)
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=774, total_tokens=784)
+        completion_tokens=max_completion_tokens,
+        prompt_tokens=hf_prompt_tokens,
+        total_tokens=hf_prompt_tokens + max_completion_tokens)

    message = choice.message
    message = chat_completion.choices[0].message

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -3,6 +3,8 @@
 import os
 import pytest
 import requests
+from PIL import Image
+from transformers import AutoProcessor

 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.multimodal.utils import encode_image_base64, fetch_image
@@ -60,11 +62,24 @@ def base64_encoded_image() -> dict[str, str]:
    }


+def get_hf_prompt_tokens(model_name, content, image_url):
+    """Return the prompt length the HF processor produces for image + text.
+
+    The prompt is the image placeholder followed by ``content`` (no chat
+    template is applied); it is passed to the processor together with the
+    image downloaded from ``image_url``. The result is the size of
+    dimension 1 of the returned ``input_ids``.
+    """
+    hf_processor = AutoProcessor.from_pretrained(model_name,
+                                                 trust_remote_code=True,
+                                                 num_crops=4)
+
+    image_tag = "<|image_1|> "
+    text_prompt = f"{image_tag}{content}"
+    http_response = requests.get(image_url, stream=True)
+    pil_images = [Image.open(http_response.raw)]
+    model_inputs = hf_processor(text_prompt, pil_images, return_tensors="pt")
+    return model_inputs.input_ids.shape[1]
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
                               image_url: str):
+    content_text = "Represent the given image."
    messages = [{
        "role":
        "user",
@@ -77,7 +92,7 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
            },
            {
                "type": "text",
-                "text": "Represent the given image."
+                "text": content_text
            },
        ],
    }]
@@ -93,9 +108,12 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
    response.raise_for_status()
    embeddings = EmbeddingResponse.model_validate(response.json())

+    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text,
+                                            image_url)
+
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 3072
    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 763
-    assert embeddings.usage.total_tokens == 763
+    assert embeddings.usage.prompt_tokens == hf_prompt_tokens
+    assert embeddings.usage.total_tokens == hf_prompt_tokens
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -10,11 +10,11 @@ from transformers import __version__ as TRANSFORMERS_VERSION

 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
-                                         _try_extract_ast, load_chat_template,
+from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
                                         parse_chat_messages,
                                         parse_chat_messages_futures,
-                                         resolve_chat_template_content_format)
+                                         resolve_chat_template_content_format,
+                                         resolve_hf_chat_template)
 from vllm.entrypoints.llm import apply_hf_chat_template
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
@@ -750,7 +750,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
    }] if use_tools else None

    # Test detecting the tokenizer's chat_template
-    chat_template = _resolve_hf_chat_template(
+    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
@@ -784,7 +784,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
    tokenizer = tokenizer_group.tokenizer

    # Test detecting the tokenizer's chat_template
-    chat_template = _resolve_hf_chat_template(
+    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,

--- a/tests/kernels/test_block_fp8.py
+++ b/tests/kernels/test_block_fp8.py
@@ -6,12 +6,25 @@ import itertools
 import pytest
 import torch

+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+    deep_gemm_moe_fp8)
+from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform

+dg_available = False
+try:
+    import deep_gemm
+    dg_available = True
+except ImportError:
+    pass
+
 if current_platform.get_device_capability() < (9, 0):
    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
                allow_module_level=True)
@@ -21,17 +34,18 @@ DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 NUM_TOKENS = [7, 83, 2048]
 D = [512, 4096, 5120, 13824]
 GROUP_SIZE = [64, 128, 256, 512]
-M = [1, 7, 83, 512, 2048]
-N = [128, 512, 1024, 4096, 7748, 13824]
-K = [256, 4096, 5120, 3884, 13824]
+M = [1, 7, 8, 83, 84, 512, 2048, 4096]
+N = [128, 512, 1024, 4096, 7168, 7748, 13824]
+K = [256, 4096, 5120, 3884, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M_moe = [1, 7, 83, 512, 2048]
-N_moe = [4608]  # [128, 4608, 13824]
-K_moe = [7168]  # [256, 7168, 13824]
+M_moe = [1, 2, 7, 83, 128, 512, 2048]
+M_moe_dg = [128, 192, 512, 1335, 2048]
+N_moe = [128, 256, 1024, 4608]  # [13824]
+K_moe = [256, 512, 7168]  # [13824]
 BLOCK_SIZE = [[128, 128]]
-E = [8, 24]  # [8, 24, 128, 256]
-TOP_KS = [2]  # [1, 2, 6]
+E = [2, 8, 16, 24]  # [128, 256]
+TOP_KS = [1, 2, 6]
 OUT_DTYPES = [torch.bfloat16]  # [torch.float32, torch.half, torch.bfloat16]
 SEEDS = [0]

@@ -217,11 +231,16 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
                      SEEDS))
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
+    if topk > E:
+        pytest.skip(f"Skipping test; topk={topk} > E={E}")
+
    torch.manual_seed(seed)
    factor_for_scale = 1e-2
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max, fp8_min = fp8_info.max, fp8_info.min

+    vllm_config = VllmConfig()
+
    a = torch.randn((M, K), dtype=dtype) / 10

    w1_bf16 = (torch.rand(
@@ -246,25 +265,240 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):

    score = torch.randn((M, E), dtype=dtype)

-    out = fused_moe(
-        a,
-        w1,
-        w2,
-        score,
-        topk,
-        renormalize=False,
-        use_fp8_w8a8=True,
-        w1_scale=w1_s,
-        w2_scale=w2_s,
-        block_shape=block_size,
-    )
-    ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
-                                       block_size)
-
-    print(f"{out.sum()=}")
-    print(f"{ref_out.sum()=}")
+    # Set the context to avoid lots of warning spam.
+    with set_current_vllm_config(vllm_config):
+        out = fused_moe(
+            a,
+            w1,
+            w2,
+            score,
+            topk,
+            renormalize=False,
+            use_fp8_w8a8=True,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            block_shape=block_size,
+        )
+        ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
+                                           block_size)
+
+    #print(f"{out.sum()=}")
+    #print(f"{ref_out.sum()=}")
+
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.03
+
+
+def per_block_cast_to_fp8(
+        x: torch.Tensor,
+        block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize a 2-D tensor to ``float8_e4m3fn`` with one scale per block.
+
+    ``x`` is zero-padded so that its row count is a multiple of 128 and its
+    column count a multiple of ``block_size_n``. Every ``128 x block_size_n``
+    block is multiplied by ``448.0 / amax`` (``amax`` clamped to at least
+    1e-4) before the cast.
+
+    Returns:
+        ``(x_fp8, scales)``: the quantized tensor cropped back to the shape
+        of ``x``, and the per-block scales ``amax / 448.0`` laid out as
+        (row blocks, column blocks).
+    """
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (deep_gemm.ceil_div(m, 128) * 128,
+         deep_gemm.ceil_div(n, block_size_n) * block_size_n),
+        dtype=x.dtype,
+        device=x.device)
+    x_padded[:m, :n] = x
+    # Split the columns by block_size_n (not a fixed 128) so the block view
+    # agrees with the padding above for every block width.
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // block_size_n,
+                           block_size_n)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
+    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+    return x_scaled_sub, scales
+
+
+@pytest.mark.parametrize(
+    "M,N,K,block_size,out_dtype,seed",
+    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
+@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
+@torch.inference_mode()
+def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
+    """Compare DeepGemm's FP8 GEMM against the native block-FP8 reference.
+
+    Skipped when DeepGemm cannot be imported (the body calls ``deep_gemm``
+    directly) and for sizes that fail the alignment check below.
+    """
+    # only aligned sizes
+    if M % 4 != 0 or K % 128 != 0 or N % 64 != 0:
+        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
+
+    torch.manual_seed(seed)
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max = fp8_info.max
+
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+
+    _, block_k = block_size[0], block_size[1]
+
+    A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k)
+    B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
+
+    As = As_fp8.to(torch.float32)
+    Bs = Bs_fp8.to(torch.float32)
+
+    ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
+                                           out_dtype)
+
+    # Transpose earlier so that the testing will not trigger transposing kernels
+    As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8)
+
+    out = torch.zeros((M, N), device='cuda', dtype=out_dtype)
+
+    assert As_fp8.shape == (M, (K + 127) //
+                            128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
+
+    deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
+
+    # Mean absolute error relative to the mean magnitude of the reference.
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.001
+
+
+def fp8_perm(m, idx):
+    """Index ``m`` along its first dimension with ``idx``.
+
+    8-bit floating-point tensors are reinterpreted as ``uint8`` for the
+    indexing step and viewed back as their original dtype afterwards; every
+    other tensor is indexed directly.
+    """
+    is_fp8 = torch.is_floating_point(m) and torch.finfo(m.dtype).bits == 8
+    if not is_fp8:
+        return m[idx, ...]
+    raw_bytes = m.view(dtype=torch.uint8)
+    return raw_bytes[idx, ...].view(dtype=m.dtype)
+
+
+def _moe_permute(a, a_s, topk_ids, num_groups, topk, block_m):
+    """Reorder the rows of ``a`` (and ``a_s``) by the aligned token order.
+
+    Returns ``(a, a_s, m_indices, inv_perm)``: the permuted rows, the
+    permuted scales (or ``None`` when ``a_s`` is ``None``), the second
+    output of ``moe_align_block_size`` repeated ``block_m`` times, and the
+    first ``M * topk`` entries of the argsort of the sorted token ids.
+    """
+    num_rows, _ = a.shape
+
+    sorted_ids, block_ids, _ = moe_align_block_size(
+        topk_ids, block_m, num_groups, None, pad_sorted_ids=True)
+
+    total_slots = topk * num_rows
+
+    # Ids above the last valid slot are clamped onto it so they stay usable
+    # as row indices (presumably padding entries -- confirm against
+    # moe_align_block_size).
+    sorted_ids = sorted_ids.clamp(max=total_slots - 1)
+    m_indices = torch.repeat_interleave(block_ids, block_m, dim=0)
+    inv_perm = torch.argsort(sorted_ids)[:total_slots]
+
+    # Each slot id maps back to its source row by dividing by topk.
+    source_rows = sorted_ids // topk
+    permuted = fp8_perm(a, source_rows)
+    permuted_scales = a_s[source_rows] if a_s is not None else a_s
+
+    return permuted, permuted_scales, m_indices, inv_perm
+
+
+def _moe_unpermute(out, inv_perm, topk, K, topk_weight):
+    """Undo the permutation and combine the ``topk`` rows of every token.
+
+    ``out`` is re-ordered with ``inv_perm``, viewed as ``(-1, topk, K)``,
+    multiplied by ``topk_weight`` (cast to the dtype of ``out``) and summed
+    over the ``topk`` dimension.
+    """
+    num_tokens = topk_weight.shape[0]
+    restored = out[inv_perm, ...]
+    per_token = restored.view(-1, topk, K)
+    weights = topk_weight.view(num_tokens, -1, 1).to(restored.dtype)
+    return (per_token * weights).sum(dim=1)
+
+
+def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
+                                 block_shape):
+    """Fused moe with block-wise quantization using DeepGemm grouped gemm.
+
+    Pipeline: top-k routing, FP8 quantization of ``a``, permutation via
+    ``_moe_permute``, first grouped GEMM with ``w1``, ``SiluAndMul``,
+    re-quantization, second grouped GEMM with ``w2``, then
+    ``_moe_unpermute`` with the routing weights.
+
+    The ``M`` and ``K`` arguments are overwritten from ``a.shape`` below, so
+    the values passed by the caller are not used.
+    """
+    num_groups = w1.shape[0]
+    M, K = a.shape
+    N = w2.shape[-1]
+
+    topk_weight, topk_ids = fused_topk(a, score.float(), topk, False)
+
+    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+
+    _, block_k = block_shape[0], block_shape[1]
+
+    # NOTE(review): the group size here is block_m, while the activation
+    # output below is quantized with block_k -- confirm this is intended
+    # (the visible caller passes block_shape = [block_m, block_m]).
+    a_q, a_s = per_token_group_quant_fp8(a, block_m)
+
+    a_q, a_s, m_indices, inv_perm = _moe_permute(a_q, a_s, topk_ids,
+                                                 num_groups, topk, block_m)
+
+    # Output buffer of the first grouped GEMM: N * 2 columns per permuted row.
+    inter_out = torch.zeros((a_q.shape[0], N * 2),
+                            dtype=torch.bfloat16,
+                            device=a.device)
+
+    deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous((a_q, a_s), (w1, w1_s),
+                                                        inter_out, m_indices)
+
+    act_out = SiluAndMul().forward_native(inter_out)
+    act_out_q, act_out_s = per_token_group_quant_fp8(act_out, block_k)
+
+    # Output buffer of the second grouped GEMM: K columns per permuted row.
+    out = torch.zeros(a_q.shape[0], K, dtype=torch.bfloat16, device=a.device)
+
+    deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
+        (act_out_q, act_out_s), (w2, w2_s), out, m_indices)
+
+    final_out = _moe_unpermute(out, inv_perm, topk, K, topk_weight)
+
+    return final_out
+
+
+@pytest.mark.parametrize(
+    "M,N,K,E,topk,seed",
+    itertools.product(M_moe_dg, N_moe, K_moe, E, TOP_KS, SEEDS))
+@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
+@torch.inference_mode()
+def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
+    """Compare ``deep_gemm_moe_fp8`` with a reference MoE implementation.
+
+    Random bf16 weights are quantized per block with
+    ``per_block_cast_to_fp8``; the test passes when the mean absolute error
+    relative to the reference's mean magnitude is below 0.03.
+    """
+
+    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+    block_size = [block_m, block_m]
+    dtype = torch.bfloat16
+
+    # only aligned sizes
+    if (N % block_m != 0 or K % block_m != 0 or topk > E):
+        pytest.skip(
+            f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}")
+
+    if N <= 512:
+        pytest.skip("Skipping N <= 512 until performance issues solved.")
+
+    vllm_config = VllmConfig()
+
+    torch.manual_seed(seed)
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    a = torch.randn((M, K), dtype=dtype) / 10
+
+    # Uniform random weights spanning [-fp8_max, fp8_max), clamped to the
+    # representable FP8 range.
+    w1_bf16 = ((torch.rand((E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 *
+               fp8_max).clamp(min=fp8_min, max=fp8_max)
+
+    w2_bf16 = ((torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 *
+               fp8_max).clamp(min=fp8_min, max=fp8_max)
+
+    score = torch.randn((M, E), dtype=dtype)
+
+    # Number of scale tiles per weight dimension (ceiling division).
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles_w1 = ((2 * N) + block_n - 1) // block_n
+    k_tiles_w1 = (K + block_k - 1) // block_k
+    n_tiles_w2 = (K + block_n - 1) // block_n
+    k_tiles_w2 = (N + block_k - 1) // block_k
+
+    w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn)
+    w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn)
+
+    w1_s = torch.empty((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+    w2_s = torch.empty((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+
+    w1_s = deep_gemm.get_col_major_tma_aligned_tensor(w1_s).contiguous()
+    w2_s = deep_gemm.get_col_major_tma_aligned_tensor(w2_s).contiguous()
+
+    # NOTE(review): this check hard-codes 128 while the tile counts above use
+    # block_n / block_k -- it only holds if block_m is 128; confirm.
+    assert w1_s.shape == (E, (2 * N + 127) // 128, (K + 127) // 128)
+    assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
+
+    # Quantize each expert's weights and fill in its per-block scales.
+    for i in range(E):
+        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
+        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
+
+    # Set the context to avoid lots of warning spam.
+    with set_current_vllm_config(vllm_config):
+        # The DeepGemm-based reference is used for M >= 128, the torch
+        # reference otherwise.
+        if M >= 128:
+            ref_out = deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s,
+                                                   score, topk, block_size)
+        else:
+            ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score,
+                                               topk, block_size)
+
+        topk_weights, topk_ids = fused_topk(a, score.float(), topk, False)
+
+        out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids)
+
+    #print(f"{out.sum()=}")
+    #print(f"{ref_out.sum()=}")

    rel_diff = (torch.mean(
        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
                torch.mean(torch.abs(ref_out.to(torch.float32))))
+
    assert rel_diff < 0.03