Merge tag 'v0.18.0' into v0.18.0-ori

3fb4b5fa · zhuwenwen · bcf25339 · 89138b21 · 3fb4b5fa · 3fb4b5fa
Commit 3fb4b5fa authored Mar 23, 2026 by zhuwenwen
20 changed files
--- a/benchmarks/kernels/bench_int8_gemm.py
+++ b/benchmarks/kernels/bench_int8_gemm.py
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -5,12 +5,14 @@ import time
 import torch
+from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
 @torch.inference_mode()
+@default_vllm_config()
 def main(
    num_tokens: int,
    hidden_size: int,
@@ -32,14 +34,14 @@ def main(
    residual = torch.randn_like(x) * scale if add_residual else None
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
        for _ in range(num_iters):
            layer(x, residual)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end_time = time.perf_counter()
        if profile:

--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -1035,7 +1035,7 @@ def bench_optype(
    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Merge into a single kwargs and qualify arguments as ArgPool
    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}

--- a/benchmarks/kernels/benchmark_mla_k_concat.py
+++ b/benchmarks/kernels/benchmark_mla_k_concat.py
@@ -47,13 +47,13 @@ def benchmark_method(
    # Warmup
    for _ in range(num_warmup):
        _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Benchmark
    start = time.perf_counter()
    for _ in range(num_iters):
        _ = method(k_nope, k_pe)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    end = time.perf_counter()
    return (end - start) / num_iters * 1000  # Convert to ms

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -16,6 +16,10 @@ import torch
 from ray.experimental.tqdm_ray import tqdm
 from vllm.model_executor.layers.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -50,7 +54,7 @@ def clear_triton_cache():
    # Clear CUDA memory cache
    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
+        torch.accelerator.empty_cache()
    # Try to clear Triton's runtime cache
    try:
@@ -99,13 +103,38 @@ def benchmark_config(
    dtype: torch.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
+    use_int4_w4a16: bool = False,
    num_iters: int = 100,
    block_quant_shape: list[int] = None,
    use_deep_gemm: bool = False,
 ) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    if use_int8_w8a16:
+    if use_int4_w4a16:
+        # Int4 packed weights: 2 int4 values per uint8 byte
+        # K dimension is packed (halved)
+        intermediate_size = shard_intermediate_size // 2  # after silu_and_mul
+        w1 = torch.randint(
+            0,
+            255,
+            (
+                num_experts,
+                shard_intermediate_size,
+                hidden_size // 2,  # int4 packing
+            ),
+            dtype=torch.uint8,
+        )
+        w2 = torch.randint(
+            0,
+            255,
+            (
+                num_experts,
+                hidden_size,
+                intermediate_size // 2,  # int4 packing
+            ),
+            dtype=torch.uint8,
+        )
+    elif use_int8_w8a16:
        w1 = torch.randint(
            -127,
            127,
@@ -139,7 +168,20 @@ def benchmark_config(
    w2_scale = None
    a1_scale = None
    a2_scale = None
-    if use_int8_w8a16:
+    if use_int4_w4a16:
+        if block_quant_shape is None:
+            raise ValueError("block_quant_shape is required for int4_w4a16")
+        group_size = block_quant_shape[1]
+        # Scales shape: (E, N, K // group_size) in fp16
+        w1_scale = torch.rand(
+            (num_experts, shard_intermediate_size, hidden_size // group_size),
+            dtype=dtype,
+        )
+        w2_scale = torch.rand(
+            (num_experts, hidden_size, intermediate_size // group_size),
+            dtype=dtype,
+        )
+    elif use_int8_w8a16:
        w1_scale = torch.randn(
            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
        )
@@ -198,27 +240,38 @@ def benchmark_config(
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            block_shape=block_quant_shape,
+            weight_dtype="int4" if use_int4_w4a16 else None,
        )
        deep_gemm_experts = None
        if use_deep_gemm:
-            deep_gemm_experts = mk.FusedMoEModularKernel(
+            moe_config = (
-                prepare_finalize=MoEPrepareAndFinalizeNoEP(),
+                FusedMoEConfig(
+                    num_experts=num_experts,
+                    experts_per_token=topk,
+                    hidden_dim=hidden_size,
+                    intermediate_size_per_partition=shard_intermediate_size,
+                    num_local_experts=num_experts,
+                    num_logical_experts=num_experts,
+                    activation=MoEActivation.SILU,
+                    moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+                    in_dtype=init_dtype,
+                    routing_method=RoutingMethodType.TopK,
+                    device="cuda",
+                ),
+            )
+            deep_gemm_experts = mk.FusedMoEKernel(
+                prepare_finalize=maybe_make_prepare_finalize(
+                    moe=moe_config,
+                    quant_config=quant_config,
+                    allow_new_interface=True,
+                    use_monolithic=False,
+                ),
                fused_experts=TritonOrDeepGemmExperts(
-                    moe_config=FusedMoEConfig(
+                    moe_config=moe_config,
-                        num_experts=num_experts,
-                        experts_per_token=topk,
-                        hidden_dim=hidden_size,
-                        intermediate_size_per_partition=shard_intermediate_size,
-                        num_local_experts=num_experts,
-                        activation="silu",
-                        moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
-                        in_dtype=init_dtype,
-                        routing_method=RoutingMethodType.TopK,
-                        device="cuda",
-                    ),
                    quant_config=quant_config,
                ),
+                inplace=not disable_inplace(),
            )
        with override_config(config):
@@ -226,9 +279,18 @@ def benchmark_config(
                x, input_gating, topk, renormalize=not use_deep_gemm
            )
+            inplace = not disable_inplace()
            if use_deep_gemm:
-                return deep_gemm_experts(
+                return deep_gemm_experts.apply(
-                    x, w1, w2, topk_weights, topk_ids, inplace=True
+                    x,
+                    w1,
+                    w2,
+                    topk_weights,
+                    topk_ids,
+                    activation=MoEActivation.SILU,
+                    global_num_experts=num_experts,
+                    apply_router_weight_on_input=False,
+                    expert_map=False,
                )
            return fused_experts(
                x,
@@ -236,25 +298,25 @@ def benchmark_config(
                w2,
                topk_weights,
                topk_ids,
-                inplace=True,
+                inplace=inplace,
                quant_config=quant_config,
            )
    # JIT compilation & warmup
    run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -262,7 +324,7 @@ def benchmark_config(
    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
@@ -478,6 +540,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
+        use_int4_w4a16: bool = False,
        block_quant_shape: list[int] = None,
        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
@@ -485,7 +548,10 @@ class BenchmarkWorker:
        set_random_seed(self.seed)
        dtype_str = _get_config_dtype_str(
-            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+            dtype,
+            use_int8_w8a16=use_int8_w8a16,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int4_w4a16=use_int4_w4a16,
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
@@ -516,6 +582,7 @@ class BenchmarkWorker:
            dtype,
            use_fp8_w8a8,
            use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
            num_iters=100,
            block_quant_shape=block_quant_shape,
            use_deep_gemm=use_deep_gemm,
@@ -532,6 +599,7 @@ class BenchmarkWorker:
        dtype: torch.dtype,
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
+        use_int4_w4a16: bool,
        search_space: list[dict[str, int]],
        block_quant_shape: list[int],
        use_deep_gemm: bool,
@@ -542,7 +610,7 @@ class BenchmarkWorker:
        best_config = None
        best_time = float("inf")
        if current_platform.is_rocm():
-            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
            search_space = prune_rocm_search_space(
                num_tokens,
                shard_intermediate_size,
@@ -558,7 +626,11 @@ class BenchmarkWorker:
            if visible_device != f"{self.device_id}":
                need_device_guard = True
-        with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
+        with (
+            torch.accelerator.device_index(self.device_id)
+            if need_device_guard
+            else nullcontext()
+        ):
            for idx, config in enumerate(tqdm(search_space)):
                try:
                    kernel_time = benchmark_config(
@@ -571,6 +643,7 @@ class BenchmarkWorker:
                        dtype,
                        use_fp8_w8a8,
                        use_int8_w8a16,
+                        use_int4_w4a16,
                        num_iters=20,
                        block_quant_shape=block_quant_shape,
                        use_deep_gemm=use_deep_gemm,
@@ -618,6 +691,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
            else {}
        ),
        **({"kpack": config["kpack"]} if "kpack" in config else {}),
+        **({"SPLIT_K": config["SPLIT_K"]} if "SPLIT_K" in config else {}),
    }
@@ -630,11 +704,15 @@ def save_configs(
    dtype: torch.dtype,
    use_fp8_w8a8: bool,
    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
    block_quant_shape: list[int],
    save_dir: str,
 ) -> None:
    dtype_str = _get_config_dtype_str(
-        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+        dtype,
+        use_int8_w8a16=use_int8_w8a16,
+        use_fp8_w8a8=use_fp8_w8a8,
+        use_int4_w4a16=use_int4_w4a16,
    )
    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -736,6 +814,38 @@ def get_model_params(config):
    return E, topk, intermediate_size, hidden_size
+def get_quantization_group_size(config) -> int | None:
+    """Extract the quantization group size from the HF model config.
+    This reads directly from the HuggingFace config object (as returned by
+    ``get_config()``), not from vLLM's quantization config classes.
+    Supports AWQ/GPTQ-style configs (direct 'group_size' key) and
+    compressed-tensors configs (nested inside 'config_groups').
+    """
+    quantization_config = getattr(config, "quantization_config", {})
+    if not isinstance(quantization_config, dict):
+        return None
+    # AWQ / GPTQ style: group_size is a top-level key
+    gs = quantization_config.get("group_size")
+    if gs is not None:
+        return gs
+    # compressed-tensors style: group_size is nested in config_groups
+    config_groups = quantization_config.get("config_groups", {})
+    if not isinstance(config_groups, dict):
+        return None
+    for group_cfg in config_groups.values():
+        if not isinstance(group_cfg, dict):
+            continue
+        weights = group_cfg.get("weights", {})
+        if not isinstance(weights, dict):
+            continue
+        gs = weights.get("group_size")
+        if gs is not None:
+            return gs
+    return None
 def main(args: argparse.Namespace):
    print(args)
@@ -754,7 +864,20 @@ def main(args: argparse.Namespace):
    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
+    use_int4_w4a16 = args.dtype == "int4_w4a16"
    block_quant_shape = get_weight_block_size_safety(config)
+    if use_int4_w4a16:
+        group_size = get_quantization_group_size(config)
+        if group_size is None:
+            raise ValueError(
+                "Could not determine group_size from model config. "
+                "The model's quantization_config must contain a 'group_size' "
+                "field (AWQ/GPTQ) or 'config_groups.*.weights.group_size' "
+                "(compressed-tensors)."
+            )
+        # For int4_w4a16, block_shape = [0, group_size]
+        # block_shape[0]=0 means no block quantization on N dimension
+        block_quant_shape = [0, group_size]
    if args.batch_size is None:
        batch_sizes = [
@@ -808,8 +931,20 @@ def main(args: argparse.Namespace):
        return ray.get(outputs)
    if args.tune:
-        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        # int4_w4a16 weights are uint8-packed, not fp16; treat like fp8 for
-        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
+        # search space generation (no matrix_instr_nonkdim/kpack exploration).
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
+        # For int4_w4a16, the group_size constraint on BLOCK_SIZE_K does not
+        # apply: the gptq_awq kernel handles arbitrary BLOCK_SIZE_K regardless
+        # of group_size. Skip block_quant_shape filtering to keep the full
+        # search space (e.g. BLOCK_SIZE_K=64 with group_size=128).
+        tune_block_quant_shape = None if use_int4_w4a16 else block_quant_shape
+        search_space = get_configs_compute_bound(is_fp16, tune_block_quant_shape)
+        if use_int4_w4a16:
+            # SPLIT_K is a required kernel constexpr for gptq_awq kernel;
+            # only SPLIT_K=1 is used at runtime, so fix it during tuning.
+            for cfg in search_space:
+                cfg["SPLIT_K"] = 1
        print(f"Start tuning over {len(search_space)} configurations...")
        if use_deep_gemm:
            raise ValueError(
@@ -829,6 +964,7 @@ def main(args: argparse.Namespace):
                    dtype,
                    use_fp8_w8a8,
                    use_int8_w8a16,
+                    use_int4_w4a16,
                    search_space,
                    block_quant_shape,
                    use_deep_gemm,
@@ -848,6 +984,7 @@ def main(args: argparse.Namespace):
            dtype,
            use_fp8_w8a8,
            use_int8_w8a16,
+            use_int4_w4a16,
            block_quant_shape,
            args.save_dir,
        )
@@ -866,6 +1003,7 @@ def main(args: argparse.Namespace):
                    dtype,
                    use_fp8_w8a8,
                    use_int8_w8a16,
+                    use_int4_w4a16,
                    block_quant_shape,
                    use_deep_gemm,
                )
@@ -888,7 +1026,10 @@ if __name__ == "__main__":
    )
    parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
    parser.add_argument(
-        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
+        "--dtype",
+        type=str,
+        choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
+        default="auto",
    )
    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument(

--- a/benchmarks/kernels/benchmark_moe_defaults.py
+++ b/benchmarks/kernels/benchmark_moe_defaults.py
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark comparing old vs new default fused MoE configs.
+Runs the triton fused_moe kernel with three configurations for each scenario:
+  1. Tuned config (from JSON file, if available) — the target to match
+  2. Old default (the hardcoded defaults before this change)
+  3. New default (the improved defaults)
+Usage:
+    python benchmarks/kernels/benchmark_moe_defaults.py
+Produces a table showing kernel time (us) and speedup of new vs old defaults.
+"""
+import torch
+from vllm.model_executor.layers.fused_moe import fused_topk, override_config
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    fused_experts,
+    get_default_config,
+    get_moe_configs,
+)
+from vllm.platforms import current_platform
+from vllm.triton_utils import triton
+from vllm.utils.torch_utils import set_random_seed
+FP8_DTYPE = current_platform.fp8_dtype()
+def old_default_config(M, E, N, K, topk, dtype=None, block_shape=None):
+    """The original defaults before https://github.com/vllm-project/vllm/pull/34846,
+    for comparison."""
+    if dtype == "fp8_w8a8" and block_shape is not None:
+        return {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_shape[0],
+            "BLOCK_SIZE_K": block_shape[1],
+            "GROUP_SIZE_M": 32,
+            "SPLIT_K": 1,
+            "num_warps": 4,
+            "num_stages": 3 if not current_platform.is_rocm() else 2,
+        }
+    elif M <= E:
+        return {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 1,
+            "SPLIT_K": 1,
+        }
+    else:
+        return {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+        }
+def benchmark_config(
+    config,
+    M,
+    E,
+    N,
+    K,
+    topk,
+    dtype,
+    use_fp8=False,
+    block_shape=None,
+    num_iters=100,
+):
+    """Time a single kernel config. Returns kernel time in microseconds."""
+    init_dtype = torch.float16 if use_fp8 else dtype
+    a = torch.randn(M, K, device="cuda", dtype=init_dtype) / 10
+    w1 = torch.randn(E, 2 * N, K, device="cuda", dtype=init_dtype) / 10
+    w2 = torch.randn(E, K, N, device="cuda", dtype=init_dtype) / 10
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    if use_fp8:
+        if block_shape is not None:
+            bsn, bsk = block_shape
+            n_tiles_w1 = triton.cdiv(2 * N, bsn)
+            k_tiles_w1 = triton.cdiv(K, bsk)
+            n_tiles_w2 = triton.cdiv(K, bsn)
+            k_tiles_w2 = triton.cdiv(N, bsk)
+            w1_scale = torch.rand(
+                E, n_tiles_w1, k_tiles_w1, device="cuda", dtype=torch.float32
+            )
+            w2_scale = torch.rand(
+                E, n_tiles_w2, k_tiles_w2, device="cuda", dtype=torch.float32
+            )
+        else:
+            w1_scale = torch.rand(E, device="cuda", dtype=torch.float32)
+            w2_scale = torch.rand(E, device="cuda", dtype=torch.float32)
+        a1_scale = torch.rand(1, device="cuda", dtype=torch.float32)
+        a2_scale = torch.rand(1, device="cuda", dtype=torch.float32)
+        # Only weights are stored in fp8; activations stay in bf16/fp16
+        # and get dynamically quantized inside the kernel.
+        w1 = w1.to(FP8_DTYPE)
+        w2 = w2.to(FP8_DTYPE)
+    quant_config = FusedMoEQuantConfig.make(
+        quant_dtype=torch.float8_e4m3fn if use_fp8 else None,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
+    gating = torch.randn(M, E, device="cuda", dtype=torch.float32)
+    # Warmup
+    for _ in range(20):
+        with override_config(config):
+            topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
+            fused_experts(
+                a,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                quant_config=quant_config,
+            )
+    torch.accelerator.synchronize()
+    # Benchmark
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+    start.record()
+    for _ in range(num_iters):
+        with override_config(config):
+            topk_weights, topk_ids, _ = fused_topk(a, gating, topk, renormalize=True)
+            fused_experts(
+                a,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                quant_config=quant_config,
+            )
+    end.record()
+    torch.accelerator.synchronize()
+    return start.elapsed_time(end) / num_iters * 1000  # ms -> us
+# Model configurations: (name, E, N, K, topk, dtype_str, use_fp8, block_shape)
+# N = moe_intermediate_size // tp_size (the value used in config file lookup)
+MODELS = [
+    # --- Few experts ---
+    ("Mixtral bf16", 8, 7168, 4096, 2, None, False, None),
+    ("Mixtral fp8", 8, 7168, 4096, 2, "fp8_w8a8", True, None),
+    # --- Many experts: real model shapes at tp=1 ---
+    # Qwen2-MoE-57B: E=60, topk=4, N=1408, K=2048
+    ("Qwen2-MoE bf16", 60, 1408, 2048, 4, None, False, None),
+    # DeepSeek-V2: E=64, topk=6, N=1407, K=4096
+    # (use 1408 to avoid odd alignment; real model is 1407)
+    ("DeepSeek-V2 bf16", 64, 1408, 4096, 6, None, False, None),
+    # OLMoE-7B: E=64, topk=8, N=2048, K=2048
+    ("OLMoE bf16", 64, 2048, 2048, 8, None, False, None),
+    # GLM-4-100B-A10B: E=128, topk=8, N=1408, K=4096
+    ("GLM-4-MoE bf16", 128, 1408, 4096, 8, None, False, None),
+    # Qwen3-30B-A3B: E=128, topk=8, N=768, K=2048
+    ("Qwen3-MoE bf16", 128, 768, 2048, 8, None, False, None),
+    # DeepSeek-V3 / MiMo-V2-Flash: E=256, topk=8, N=2048, K=7168
+    ("DeepSeek-V3 bf16", 256, 2048, 7168, 8, None, False, None),
+    # Qwen3.5-70B-A22B (Qwen3-Next): E=512, topk=10, N=512, K=2048
+    ("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
+    # E=128 N=1856 bf16
+    ("E128 N1856 bf16", 128, 1856, 4096, 8, None, False, None),
+    # E=256 N=512 bf16 (DS-V3 tp=4)
+    ("DS-V3 tp4 bf16", 256, 512, 7168, 8, None, False, None),
+    # E=512 N=512 bf16 (Qwen3-Next tp=1)
+    ("Qwen3-Next bf16", 512, 512, 2048, 10, None, False, None),
+    # E=512 N=256 bf16 (Qwen3-Next tp=2)
+    ("Qwen3-Next tp2", 512, 256, 2048, 10, None, False, None),
+    # --- FP8 block quant (many experts) ---
+    # DS-V3 tp=4: E=256, N=512, fp8 block
+    ("DS-V3 tp4 fp8blk", 256, 512, 7168, 8, "fp8_w8a8", True, [128, 128]),
+    # DS-V3 tp=8: E=256, N=256, fp8 block
+    ("DS-V3 tp8 fp8blk", 256, 256, 7168, 8, "fp8_w8a8", True, [128, 128]),
+    # Qwen3-Next tp=2 fp8 block
+    ("Qwen3-Next tp2 fp8blk", 512, 256, 2048, 10, "fp8_w8a8", True, [128, 128]),
+]
+BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
+def main():
+    set_random_seed(0)
+    torch.set_default_device("cuda")
+    dtype = torch.bfloat16
+    for name, E, N, K, topk, dtype_str, use_fp8, block_shape in MODELS:
+        print(f"\n{'=' * 90}")
+        print(f"  {name}  (E={E}, N={N}, K={K}, topk={topk})")
+        print(f"{'=' * 90}")
+        # Try to load tuned config
+        block_n = block_shape[0] if block_shape else None
+        block_k = block_shape[1] if block_shape else None
+        tuned = get_moe_configs(E, N, dtype_str, block_n, block_k)
+        has_tuned = tuned is not None
+        print(f"  Tuned config available: {has_tuned}")
+        hdr = (
+            f"{'Batch':>6} | {'Tuned (us)':>11} | {'Old (us)':>11} | "
+            f"{'New (us)':>11} | {'New/Old':>8} | {'New/Tuned':>10}"
+        )
+        print(f"  {hdr}")
+        print(f"  {'-' * len(hdr)}")
+        for M in BATCH_SIZES:
+            old_cfg = old_default_config(M, E, N, K, topk, dtype_str, block_shape)
+            new_cfg = get_default_config(M, E, N, K, topk, dtype_str, block_shape)
+            if has_tuned:
+                tuned_cfg = tuned[min(tuned.keys(), key=lambda x: abs(x - M))]
+                t_tuned = benchmark_config(
+                    tuned_cfg,
+                    M,
+                    E,
+                    N,
+                    K,
+                    topk,
+                    dtype,
+                    use_fp8=use_fp8,
+                    block_shape=block_shape,
+                )
+            else:
+                t_tuned = None
+            t_old = benchmark_config(
+                old_cfg,
+                M,
+                E,
+                N,
+                K,
+                topk,
+                dtype,
+                use_fp8=use_fp8,
+                block_shape=block_shape,
+            )
+            t_new = benchmark_config(
+                new_cfg,
+                M,
+                E,
+                N,
+                K,
+                topk,
+                dtype,
+                use_fp8=use_fp8,
+                block_shape=block_shape,
+            )
+            ratio_new_old = t_new / t_old
+            tuned_str = f"{t_tuned:11.2f}" if t_tuned else f"{'N/A':>11}"
+            ratio_tuned = f"{t_new / t_tuned:10.2f}x" if t_tuned else f"{'N/A':>10}"
+            # flag regressions where new default is >5% slower than old
+            marker = " <--" if ratio_new_old > 1.05 else ""
+            print(
+                f"  {M:>6} | {tuned_str} | {t_old:11.2f} | {t_new:11.2f} "
+                f"| {ratio_new_old:7.2f}x | {ratio_tuned}{marker}"
+            )
+if __name__ == "__main__":
+    main()
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -72,19 +72,19 @@ def benchmark_permute(
    # JIT compilation & warmup
    run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -92,7 +92,7 @@ def benchmark_permute(
    latencies: list[float] = []
    for i in range(num_iters):
        prepare(i)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
@@ -185,26 +185,26 @@ def benchmark_unpermute(
    # JIT compilation & warmup
    input = prepare()
    run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Capture 10 invocations with CUDA graph
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        for _ in range(10):
            run(input)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Warmup
    for _ in range(5):
        graph.replay()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
    latencies: list[float] = []
    for i in range(num_iters):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        graph.replay()
        end_event.record()

--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -36,6 +36,7 @@ from typing import Any
 import numpy as np
 import torch
+from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.transformers_utils.config import get_config
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -78,6 +79,7 @@ def calculate_stats(times: list[float]) -> dict[str, float]:
    }
+@default_vllm_config()
 def benchmark_mrope(
    model_name: str,
    num_tokens: int,
@@ -133,14 +135,14 @@ def benchmark_mrope(
            key.clone(),
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Time reference implementation
    torch_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.time()
        mrope_helper_class.forward_native(
@@ -149,7 +151,7 @@ def benchmark_mrope(
            key_clone,
        )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        torch_times.append(time.time() - start_time)
    # Time triton kernel implementation
@@ -157,14 +159,14 @@ def benchmark_mrope(
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.time()
        mrope_helper_class.forward_cuda(
            positions,
            query_clone,
            key_clone,
        )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        triton_times.append(time.time() - start_time)
    # Calculate statistics

--- a/benchmarks/kernels/bench_mxfp4_qutlass.py
+++ b/benchmarks/kernels/bench_mxfp4_qutlass.py
--- a/benchmarks/kernels/bench_nvfp4_gemm.py
+++ b/benchmarks/kernels/bench_nvfp4_gemm.py
--- a/benchmarks/kernels/bench_nvfp4_quant.py
+++ b/benchmarks/kernels/bench_nvfp4_quant.py
--- a/benchmarks/kernels/bench_nvfp4_qutlass.py
+++ b/benchmarks/kernels/bench_nvfp4_qutlass.py
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -103,7 +103,7 @@ def main(
        max_logits = torch.empty_like(exp_sums)
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
@@ -173,7 +173,7 @@ def main(
                    )
            else:
                raise ValueError(f"Invalid version: {version}")
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end_time = time.perf_counter()
        if profile:

--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -28,7 +28,7 @@ def _time_cuda(
    # warmup
    for _ in range(warmup_iters):
        fn()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start = torch.Event(enable_timing=True)
    end = torch.Event(enable_timing=True)
@@ -37,7 +37,7 @@ def _time_cuda(
    for _ in range(bench_iters):
        fn()
    end.record()
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    return start.elapsed_time(end) / bench_iters  # ms/iter

--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -7,6 +7,7 @@ from unittest.mock import patch
 import pandas as pd
 import torch
+from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
@@ -84,6 +85,7 @@ def calculate_diff(
 configs = []
+@default_vllm_config()
 def benchmark_quantization(
    batch_size,
    hidden_size,

--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -29,7 +29,7 @@ def main(
    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()
@@ -39,7 +39,7 @@ def main(
                ops.scaled_int8_quant(x, scale)
            else:
                ops.scaled_fp8_quant(x, scale)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        end_time = time.perf_counter()
        if profile:

--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -84,16 +84,16 @@ def run_benchmark(
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        function_under_test = lambda: g.replay()
    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters
@@ -104,7 +104,7 @@ def run_benchmark(
    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
    return lat

--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -109,16 +109,16 @@ def run_benchmark(
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        function_under_test = lambda: g.replay()
    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters
@@ -129,7 +129,7 @@ def run_benchmark(
    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
-    torch.cuda.empty_cache()
+    torch.accelerator.empty_cache()
    return lat

--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -5,6 +5,7 @@ import itertools
 import torch
+from vllm.benchmarks.lib.utils import default_vllm_config
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.triton_utils import triton
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -29,6 +30,7 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device):
            args={},
        )
    )
+    @default_vllm_config()
    def benchmark(batch_size, seq_len, num_heads, provider):
        dtype = torch.bfloat16
        max_position = 8192

--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -251,7 +251,7 @@ def benchmark(
        kernel(
            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
        )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    start_event = torch.Event(enable_timing=True)
    end_event = torch.Event(enable_timing=True)
@@ -259,7 +259,7 @@ def benchmark(
    # Benchmark
    latencies: list[float] = []
    for _ in range(runs):
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_event.record()
        for i in range(iterations_per_run):