added vllm092 auto test scripts

d1a06223 · liuxu3 · fba2e3b5 · d1a06223 · d1a06223 · d1a06223
Commit d1a06223 authored Feb 24, 2026 by liuxu3
20 changed files
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_machete.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_machete.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import copy
+import itertools
+import math
+import os
+import pickle as pkl
+import time
+from collections.abc import Iterable
+from dataclasses import dataclass
+from itertools import product
+from typing import Callable, Optional
+import pandas as pd
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL,
+    GPTQ_MARLIN_MIN_THREAD_N,
+    marlin_permute_scales,
+    marlin_zero_points,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_rows,
+    quantize_weights,
+)
+from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils import FlexibleArgumentParser
+DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
+DEFAULT_TP_SIZES = [1]
+NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False)
+if NVTX_PROFILE:
+    import nvtx
+def terse_type_name(dt):
+    return {
+        torch.bfloat16: "bf16",
+        torch.float16: "fp16",
+        torch.int8: "int8",
+        torch.float8_e4m3fn: "fp8",
+        torch.float: "float",
+        torch.int: "int",
+    }[dt]
+@dataclass
+class BenchmarkTensors:
+    w_ref: torch.Tensor
+    a: torch.Tensor
+    w_q: torch.Tensor
+    group_size: Optional[int]
+    wtype: ScalarType
+    w_g_s: torch.Tensor
+    w_g_zp: Optional[torch.Tensor]
+    w_ch_s: Optional[torch.Tensor]
+    w_tok_s: Optional[torch.Tensor]
+@dataclass
+class TypeConfig:
+    act_type: torch.dtype
+    weight_type: ScalarType
+    output_type: Optional[torch.dtype]
+    group_scale_type: Optional[torch.dtype]
+    group_zero_type: Optional[torch.dtype]
+    channel_scale_type: Optional[torch.dtype]
+    token_scale_type: Optional[torch.dtype]
+def rand_data(shape, dtype=torch.float16, scale=1):
+    if dtype.is_floating_point:
+        return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype)
+    else:
+        return torch.randint(-15, 15, shape, dtype=dtype, device="cuda")
+def quantize_and_pack(
+    atype: torch.dtype,
+    w: torch.Tensor,
+    wtype: ScalarType,
+    stype: Optional[torch.dtype],
+    group_size: Optional[int],
+    zero_points: bool = False,
+):
+    assert wtype.is_integer(), "TODO: support floating point weights"
+    w_ref, w_q, w_s, w_zp = quantize_weights(
+        w,
+        wtype,
+        group_size=group_size,
+        zero_points=zero_points,
+        # to match how the kernel applies zps
+        ref_zero_points_after_scales=True,
+    )
+    w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
+    return w_ref, w_q, w_s, w_zp
+def create_bench_tensors(
+    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+) -> list[BenchmarkTensors]:
+    m, n, k = shape
+    # we want to make sure that weights don't fit into L2 cache between runs so
+    #  we construct enough weights to exceed L2 cache, which is 50mb on a H100
+    #  so we target total weight size > 2*50mb
+    num_weights = math.ceil(
+        2 * 50 * 1024**2 * 8 / (k * n * types.weight_type.size_bits)
+    )
+    a = rand_data((m, k), types.act_type, scale=5)
+    benchmark_tensors: list[BenchmarkTensors] = []
+    for _ in range(num_weights):
+        w = rand_data((k, n), types.act_type, scale=5)
+        if types.group_scale_type is not None:
+            w = w.to(types.group_scale_type)
+        if w.dtype.itemsize == 1:
+            w = w.to(torch.float16)
+        w_ref, w_q_packed, w_s, w_zp = quantize_and_pack(
+            a.dtype,
+            w,
+            types.weight_type,
+            types.group_scale_type,
+            group_size,
+            types.group_zero_type is not None,
+        )
+        if not a.dtype.is_floating_point:
+            aiinfo = torch.iinfo(a.dtype)
+            w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max)
+        w_ref = w_ref.to(torch.float32)
+        w_ch_s = (
+            None
+            if types.channel_scale_type is None
+            else rand_data((n,), types.channel_scale_type)
+        )
+        w_tok_s = (
+            None
+            if types.token_scale_type is None
+            else rand_data((m,), types.token_scale_type)
+        )
+        benchmark_tensors.append(
+            BenchmarkTensors(
+                w_ref=w_ref,
+                a=a,
+                w_q=w_q_packed,
+                wtype=types.weight_type,
+                w_g_s=w_s,
+                w_g_zp=w_zp,
+                group_size=group_size,
+                w_ch_s=w_ch_s,
+                w_tok_s=w_tok_s,
+            )
+        )
+    return benchmark_tensors
+def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+    a = bt.a
+    w = bt.w_ref.to(bt.a.dtype)  # use float reference tensor
+    if a.dtype not in [torch.float16, torch.bfloat16]:
+        a = a.to(torch.float16)
+        w = w.to(torch.float16)
+    return lambda: torch.matmul(a, w)
+def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+    if bt.w_ch_s is not None and bt.w_tok_s is not None:
+        scale_a = bt.w_tok_s.to(torch.float32)
+        scale_b = bt.w_ch_s.to(torch.float32)
+    else:
+        scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+        scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+    w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t()
+    return lambda: ops.cutlass_scaled_mm(
+        bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16
+    )
+def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+    device = bt.a.device
+    workspace = MarlinWorkspace(
+        bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
+    )
+    if bt.w_g_zp is None:
+        w_zp = torch.empty(0, dtype=torch.int, device=device)
+    else:
+        w_zp = marlin_zero_points(
+            bt.w_g_zp, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits
+        )
+    if bt.group_size is None:
+        w_s = torch.tensor([], device="cuda", dtype=torch.half)
+    else:
+        w_s = marlin_permute_scales(
+            bt.w_g_s, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.group_size
+        )
+    sort_indices = torch.empty(0, dtype=torch.int, device=device)
+    g_idx = torch.empty(0, dtype=torch.int, device=device)
+    w_q = ops.gptq_marlin_repack(
+        bt.w_q, sort_indices, bt.w_ref.shape[0], bt.w_ref.shape[1], bt.wtype.size_bits
+    )
+    if bt.a.dtype.is_floating_point:
+        assert bt.w_ch_s is None
+        assert bt.w_tok_s is None
+        assert bt.group_size is not None
+        fn = lambda: ops.gptq_marlin_gemm(
+            a=bt.a,
+            c=None,
+            b_q_weight=w_q,
+            b_scales=w_s,
+            global_scale=None,
+            b_zeros=w_zp,
+            g_idx=g_idx,
+            perm=sort_indices,
+            workspace=workspace.scratch,
+            b_q_type=bt.wtype,
+            size_m=bt.a.shape[0],
+            size_n=bt.w_ref.shape[1],
+            size_k=bt.w_ref.shape[0],
+            is_k_full=True,
+            is_zp_float=False,
+        )
+    else:
+        assert bt.a.dtype == torch.int8
+        assert bt.wtype == scalar_types.uint4b8
+        if bt.w_ch_s is not None:
+            s_ch = bt.w_ch_s.to(torch.float32)
+        else:
+            s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device)
+        if bt.w_tok_s is not None:
+            s_tok = bt.w_tok_s.to(torch.float32)
+        else:
+            s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device)
+        fn = lambda: ops.marlin_qqq_gemm(
+            a=bt.a,
+            b_q_weight=w_q,
+            s_group=w_s,
+            s_tok=s_tok,
+            s_ch=s_ch,
+            workspace=workspace.scratch,
+            size_m=bt.a.shape[0],
+            size_n=bt.w_ref.shape[1],
+            size_k=bt.w_ref.shape[0],
+        )
+    return fn
+def machete_create_bench_fn(
+    bt: BenchmarkTensors, out_type=torch.dtype, schedule=None
+) -> Callable:
+    w_q = bt.w_q.t().contiguous().t()  # make col major
+    w_q = ops.machete_prepack_B(
+        w_q, bt.a.dtype, bt.wtype, None if bt.w_g_s is None else bt.w_g_s.dtype
+    )
+    w_g_zp = bt.w_g_zp
+    if w_g_zp is not None:
+        w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype))
+    return lambda: ops.machete_mm(
+        a=bt.a,
+        b_q=w_q,
+        b_type=bt.wtype,
+        b_group_scales=bt.w_g_s,
+        b_group_zeros=w_g_zp,
+        b_group_size=bt.group_size,
+        b_channel_scales=bt.w_ch_s,
+        a_token_scales=bt.w_tok_s,
+        out_type=out_type,
+        schedule=schedule,
+    )
+# impl
+# bench
+def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]):
+    min_run_time = 1 if not NVTX_PROFILE else 0.1
+    res = TBenchmark.Timer(
+        stmt="""
+        for fn in fns:
+            fn()
+        """,
+        globals={"fns": fns},
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+    if NVTX_PROFILE:
+        with (
+            nvtx.annotate("mm-bench"),
+            nvtx.annotate(f"{label}|{sub_label}|{description}"),
+        ):
+            fns[0]()
+    return res
+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+def bench(
+    types: TypeConfig,
+    group_size: int,
+    m: int,
+    k: int,
+    n: int,
+    label: str,
+    sub_label: str,
+    sweep_schedules: bool = True,
+) -> list[TMeasurement]:
+    benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
+    sub_label += f", L={len(benchmark_tensors)}"
+    name_type_string = f"W{types.weight_type}" + f"-A{terse_type_name(types.act_type)}"
+    if types.group_scale_type is not None:
+        name_type_string += f"-GS{terse_type_name(types.group_scale_type)}"
+    if types.group_zero_type is not None:
+        name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}"
+    if group_size is not None:
+        name_type_string += f"-G{group_size}"
+    if types.channel_scale_type is not None:
+        name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}"
+    if types.token_scale_type is not None:
+        name_type_string += f"-TS{terse_type_name(types.token_scale_type)}"
+    timers = []
+    # pytorch impl
+    timers.append(
+        bench_fns(
+            label,
+            sub_label,
+            "torch.matmul (fp16)",
+            [torch_matmul_f16_create_bench_fn(bt) for bt in benchmark_tensors],
+        )
+    )
+    if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn:
+        timers.append(
+            bench_fns(
+                label,
+                sub_label,
+                f"cutlass_scaled_mm ({terse_type_name(types.act_type)})",
+                [cutlass_scaled_mm_create_bench_fn(bt) for bt in benchmark_tensors],
+            )
+        )
+    if types.act_type != torch.float8_e4m3fn:
+        timers.append(
+            bench_fns(
+                label,
+                sub_label,
+                f"marlin ({name_type_string})",
+                [marlin_create_bench_fn(bt) for bt in benchmark_tensors],
+            )
+        )
+    # machete
+    timers.append(
+        bench_fns(
+            label,
+            sub_label,
+            f"machete ({name_type_string})",
+            [
+                machete_create_bench_fn(bt, out_type=types.output_type)
+                for bt in benchmark_tensors
+            ],
+        )
+    )
+    if sweep_schedules:
+        global _SWEEP_SCHEDULES_RESULTS
+        print("Finding best schedule for machete")
+        best = None
+        best_schedule = None
+        schedules = ops.machete_supported_schedules(
+            a_type=types.act_type,
+            b_type=types.weight_type,
+            group_scales_type=types.group_scale_type,
+            group_zeros_type=types.group_zero_type,
+            token_scales_type=types.token_scale_type,
+            channel_scales_type=types.channel_scale_type,
+            out_type=types.output_type,
+        )
+        if schedules is None or len(schedules) == 0:
+            raise ValueError("No schedules found to sweep")
+        for schedule in reversed(schedules):
+            schedule_M = int(schedule.split("_")[0].split("x")[1])
+            # Prune known bad schedules
+            if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
+                continue
+            res = bench_fns(
+                label,
+                sub_label,
+                "machete_best",
+                [
+                    machete_create_bench_fn(
+                        bt, out_type=types.output_type, schedule=schedule
+                    )
+                    for bt in benchmark_tensors
+                ],
+            )
+            results_row = {
+                "M": m,
+                "K": k,
+                "N": n,
+                "group_size": group_size,
+                "schedule": schedule,
+                "median": res.median,
+            }
+            if _SWEEP_SCHEDULES_RESULTS is None:
+                _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(columns=results_row.keys())
+            _SWEEP_SCHEDULES_RESULTS.loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
+            print(f"  {res.median:5.5} ", schedule)
+            if not best or res.median < best.median:
+                best = res
+                best_schedule = schedule
+        print("Best schedule:", best_schedule)
+        timers.append(best)
+    return timers
+# runner
+def print_timers(timers: list[TMeasurement]):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    types = TypeConfig(
+        act_type=args.act_type,
+        weight_type=scalar_types.uint4b8
+        if args.group_zero_type is None
+        else scalar_types.uint4,
+        output_type=args.out_type,
+        group_scale_type=args.group_scale_type,
+        group_zero_type=args.group_zero_type,
+        channel_scale_type=args.channel_scale_type,
+        token_scale_type=args.token_scale_type,
+    )
+    results: list[TMeasurement] = []
+    for m, k, n in MKNs:
+        timers = bench(
+            types,
+            args.group_size,
+            m,
+            k,
+            n,
+            f"{args.act_type}-gemm",
+            f"MKN=({m}x{k}x{n})",
+            sweep_schedules=args.sweep_schedules,
+        )
+        print_timers(timers)
+        results.extend(timers)
+    return results
+# output makers
+def make_output(
+    data: list[TMeasurement],
+    MKNs: Iterable[tuple[int, int, int]],
+    base_description: str,
+    timestamp=None,
+):
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+    # pickle all the results
+    timestamp = int(time.time()) if timestamp is None else timestamp
+    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+# argparse runners
+def run_square_bench(args):
+    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+    data = run(args.dtype, args.sweep_schedules, MKNs)
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+def run_range_bench(args):
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
+    m_increment, k_increment, n_increment = (
+        int(x) for x in args.dim_increment.split(",")
+    )
+    Ms = list(range(m_start, m_end + 1, m_increment))
+    Ks = list(range(k_start, k_end + 1, k_increment))
+    Ns = list(range(n_start, n_end + 1, n_increment))
+    MKNs = list(product(Ms, Ks, Ns))
+    data = run(args.dtype, args.sweep_schedules, MKNs)
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+def run_model_bench(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
+        KNs = []
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KNs.append(KN)
+        return KNs
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        Ms = args.batch_sizes
+        KNs = model_shapes(model, tp_size)
+        MKNs = []
+        for m in Ms:
+            for k, n in KNs:
+                MKNs.append((m, k, n))
+        data = run(args, MKNs)
+        model_bench_data.append(data)
+    type_string = f"{args.act_type}"
+    # Print all results
+    for data, model_tp in zip(model_bench_data, models_tps):
+        model, tp_size = model_tp
+        print(f"== Results {type_string} {model}-TP{tp_size} ====")
+        print_timers(data)
+    timestr = time.strftime("%Y%m%d-%H%M%S")
+    all_results = []
+    for d in model_bench_data:
+        all_results.extend(d)
+    # pickle all data
+    with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f:
+        args_dict = vars(args)
+        args_dict.pop("func")
+        pkl.dump(
+            {
+                "args": args_dict,
+                "results": all_results,
+            },
+            f,
+        )
+if __name__ == "__main__":
+    def to_torch_dtype(dt):
+        return {
+            "bfloat16": torch.bfloat16,
+            "float16": torch.float16,
+            "int8": torch.int8,
+            "float8_e4m3fn": torch.float8_e4m3fn,
+            "int": torch.int,
+            "float": torch.float,
+        }[dt]
+    class ToTorchDtype(argparse.Action):
+        def __call__(self, parser, namespace, values, option_string=None):
+            setattr(namespace, self.dest, to_torch_dtype(values))
+    parser = FlexibleArgumentParser(
+        description="""
+Benchmark Machete GEMM.
+    To run square GEMMs:
+        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+    To run dimensions from a model:
+        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+    Output:
+        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--act-type",
+        action=ToTorchDtype,
+        required=True,
+        choices=["bfloat16", "float16", "int8", "float8_e4m3fn"],
+    )
+    parser.add_argument(
+        "--group-scale-type",
+        action=ToTorchDtype,
+        choices=["bfloat16", "float16"],
+    )
+    parser.add_argument(
+        "--group-zero-type",
+        type=to_torch_dtype,
+        choices=["bfloat16", "float16"],
+    )
+    parser.add_argument(
+        "--channel-scale-type",
+        action=ToTorchDtype,
+        choices=["float"],
+    )
+    parser.add_argument(
+        "--token-scale-type",
+        action=ToTorchDtype,
+        choices=["float"],
+    )
+    parser.add_argument(
+        "--out-type",
+        action=ToTorchDtype,
+        choices=["bfloat16", "float16"],
+    )
+    parser.add_argument(
+        "--group-size",
+        type=int,
+        help="Available options are ['None', '-1', '128'], default=128",
+        default=128,
+    )
+    parser.add_argument(
+        "--sweep-schedules",
+        action="store_true",
+        help="Run a sweep over all supported schedules",
+    )
+    parser.add_argument(
+        "--sweep-csv-out",
+        help="CSV to store sweep results",
+        default="sch_sweep_results.csv",
+    )
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+    square_parser = subparsers.add_parser("square_bench")
+    square_parser.add_argument("--dim-start", type=int, required=True)
+    square_parser.add_argument("--dim-end", type=int, required=True)
+    square_parser.add_argument("--dim-increment", type=int, required=True)
+    square_parser.set_defaults(func=run_square_bench)
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument(
+        "--dim-start",
+        type=str,
+        required=True,
+        help="Start value for M,K,N as common separated list",
+    )
+    range_parser.add_argument(
+        "--dim-end",
+        type=str,
+        required=True,
+        help="End value (inclusive) for M,K,N as common separated list",
+    )
+    range_parser.add_argument(
+        "--dim-increment",
+        type=str,
+        required=True,
+        help="Increment value for M,K,N as common separated list",
+    )
+    range_parser.set_defaults(func=run_range_bench)
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    model_parser.add_argument(
+        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
+    )
+    model_parser.add_argument(
+        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
+    )
+    model_parser.set_defaults(func=run_model_bench)
+    args = parser.parse_args()
+    _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
+    args.func(args)
+    if _SWEEP_SCHEDULES_RESULTS is not None:
+        _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_marlin.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_marlin.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL,
+    GPTQ_MARLIN_24_MIN_THREAD_N,
+    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES,
+    GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES,
+)
+from vllm.model_executor.layers.quantization.utils.allspark_utils import (
+    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+    ALLSPARK_SUPPORTED_QUANT_TYPES,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL,
+    GPTQ_MARLIN_MIN_THREAD_N,
+    MARLIN_SUPPORTED_GROUP_SIZES,
+    query_marlin_supported_quant_types,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+    FP4_MARLIN_SUPPORTED_GROUP_SIZES,
+    rand_marlin_weight_fp4_like,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+    marlin_quant_fp8_torch,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace,
+    awq_marlin_quantize,
+    marlin_quantize,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
+    marlin_24_quantize,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    gptq_pack,
+    gptq_quantize_weights,
+    quantize_weights,
+    sort_weights,
+)
+from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils import FlexibleArgumentParser
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+def bench_run(
+    results: list[benchmark.Measurement],
+    model: str,
+    act_order: bool,
+    is_k_full: bool,
+    quant_type: ScalarType,
+    group_size: int,
+    size_m: int,
+    size_k: int,
+    size_n: int,
+):
+    label = "Quant Matmul"
+    sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
+        model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
+    )
+    print(f"Testing: {sub_label}")
+    a = torch.randn(size_m, size_k).to(torch.half).cuda()
+    b = torch.rand(size_k, size_n).to(torch.half).cuda()
+    has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
+    if act_order and (group_size == -1 or group_size == size_k or has_zp):
+        return
+    if size_k % group_size != 0:
+        return
+    marlin_24_supported = (
+        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
+        and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
+    )
+    repack_supported = (
+        quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
+        and group_size in MARLIN_SUPPORTED_GROUP_SIZES
+    )
+    allspark_supported = (
+        quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
+        and group_size == -1
+        and not act_order
+        and is_k_full
+    )
+    def gen_marlin_params():
+        # Marlin quant
+        marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
+        if quant_type == scalar_types.float4_e2m1f:
+            if group_size != 16 or act_order:
+                return
+            marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
+                b.T, group_size
+            )
+        elif quant_type == scalar_types.float8_e4m3fn:
+            if group_size not in [-1, 128] or act_order:
+                return
+            marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
+        elif group_size == 16:
+            return
+        elif has_zp:
+            marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
+                b, quant_type, group_size
+            )
+        else:
+            marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
+                marlin_quantize(b, quant_type, group_size, act_order)
+            )
+        return (
+            marlin_w_ref,
+            marlin_q_w,
+            marlin_s,
+            marlin_s2,
+            marlin_zp,
+            marlin_g_idx,
+            marlin_sort_indices,
+        )
+    def gen_marlin_24_params():
+        marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
+        if marlin_24_supported:
+            (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
+                marlin_24_quantize(b, quant_type, group_size)
+            )
+        return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
+    def gen_repack_params():
+        q_w_gptq = None
+        repack_sort_indices = None
+        if repack_supported:
+            (w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
+                b, quant_type, group_size, act_order
+            )
+            q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
+            # For act_order, sort the "weights" and "g_idx"
+            # so that group ids are increasing
+            repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+            if act_order:
+                (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+        return q_w_gptq, repack_sort_indices
+    def gen_allspark_params():
+        qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
+            CUBLAS_M_THRESHOLD
+        ) = None
+        nonlocal allspark_supported
+        if allspark_supported:
+            properties = torch.cuda.get_device_properties(b.device.index)
+            sm_count = properties.multi_processor_count
+            sm_version = properties.major * 10 + properties.minor
+            supported_arch = sm_version >= 80 and sm_version < 90
+            allspark_supported = allspark_supported and supported_arch
+            if supported_arch:
+                w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
+                qw = qw.to(torch.uint8)
+                qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
+                    qw, s, zp, has_zp
+                )
+                CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
+        return (
+            qw_reorder,
+            s_reorder,
+            zp_reorder,
+            sm_count,
+            sm_version,
+            CUBLAS_M_THRESHOLD,
+        )
+    (
+        marlin_w_ref,
+        marlin_q_w,
+        marlin_s,
+        marlin_s2,
+        marlin_zp,
+        marlin_g_idx,
+        marlin_sort_indices,
+    ) = gen_marlin_params()
+    marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
+        gen_marlin_24_params()
+    )
+    q_w_gptq, repack_sort_indices = gen_repack_params()
+    qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
+        gen_allspark_params()
+    )
+    # Prepare
+    marlin_workspace = MarlinWorkspace(
+        size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
+    )
+    marlin_24_workspace = MarlinWorkspace(
+        size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
+    )
+    globals = {
+        # Gen params
+        "quant_type": quant_type,
+        "group_size": group_size,
+        "size_m": size_m,
+        "size_n": size_n,
+        "size_k": size_k,
+        "a": a,
+        # Marlin params
+        "marlin_w_ref": marlin_w_ref,
+        "marlin_q_w": marlin_q_w,
+        "marlin_s": marlin_s,
+        "marlin_s2": marlin_s2,
+        "marlin_zp": marlin_zp,
+        "marlin_g_idx": marlin_g_idx,
+        "marlin_sort_indices": marlin_sort_indices,
+        "marlin_workspace": marlin_workspace,
+        "is_k_full": is_k_full,
+        # Marlin_24 params
+        "marlin_24_w_ref": marlin_24_w_ref,
+        "marlin_24_q_w_comp": marlin_24_q_w_comp,
+        "marlin_24_meta": marlin_24_meta,
+        "marlin_24_s": marlin_24_s,
+        "marlin_24_workspace": marlin_24_workspace,
+        # GPTQ params
+        "q_w_gptq": q_w_gptq,
+        "repack_sort_indices": repack_sort_indices,
+        # AllSpark W8A16 params
+        "qw_reorder": qw_reorder,
+        "s_reorder": s_reorder,
+        "zp_reorder": zp_reorder,
+        "sm_count": sm_count,
+        "sm_version": sm_version,
+        "CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
+        # Kernels
+        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+        "gptq_marlin_repack": ops.gptq_marlin_repack,
+        "allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
+    }
+    min_run_time = 1
+    # Warmup pytorch
+    for _ in range(5):
+        torch.matmul(a, marlin_w_ref)
+    results.append(
+        benchmark.Timer(
+            stmt="torch.matmul(a, marlin_w_ref)",
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="pytorch_gemm",
+        ).blocked_autorange(min_run_time=min_run_time)
+    )
+    results.append(
+        benchmark.Timer(
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm",
+        ).blocked_autorange(min_run_time=min_run_time)
+    )
+    results.append(
+        benchmark.Timer(
+            stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
+            globals=globals,
+            label=label,
+            sub_label=sub_label,
+            description="gptq_marlin_gemm_fp32",
+        ).blocked_autorange(min_run_time=min_run_time)
+    )
+    if marlin_24_supported:
+        results.append(
+            benchmark.Timer(
+                stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="gptq_marlin_24_gemm",
+            ).blocked_autorange(min_run_time=min_run_time)
+        )
+    if repack_supported:
+        results.append(
+            benchmark.Timer(
+                stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="gptq_marlin_repack",
+            ).blocked_autorange(min_run_time=min_run_time)
+        )
+    if allspark_supported:
+        results.append(
+            benchmark.Timer(
+                stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)",  # noqa: E501
+                globals=globals,
+                label=label,
+                sub_label=sub_label,
+                description="allspark_w8a16_gemm_fp32",
+            ).blocked_autorange(min_run_time=min_run_time)
+        )
+def main(args):
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+    results: list[benchmark.Measurement] = []
+    for model in args.models:
+        for layer in WEIGHT_SHAPES[model]:
+            size_k = layer[0]
+            size_n = layer[1]
+            if len(args.limit_k) > 0 and size_k not in args.limit_k:
+                continue
+            if len(args.limit_n) > 0 and size_n not in args.limit_n:
+                continue
+            for act_order in ACT_ORDER_OPTS:
+                if (
+                    len(args.limit_act_order) > 0
+                    and act_order not in args.limit_act_order
+                ):
+                    continue
+                for is_k_full in K_FULL_OPTS:
+                    if (
+                        len(args.limit_k_full) > 0
+                        and is_k_full not in args.limit_k_full
+                    ):
+                        continue
+                    for quant_type in query_marlin_supported_quant_types():
+                        if (
+                            len(args.limit_num_bits) > 0
+                            and quant_type.size_bits not in args.limit_num_bits
+                        ):
+                            continue
+                        for group_size in (
+                            MARLIN_SUPPORTED_GROUP_SIZES
+                            + FP4_MARLIN_SUPPORTED_GROUP_SIZES
+                        ):
+                            if (
+                                len(args.limit_group_size) > 0
+                                and group_size not in args.limit_group_size
+                            ):
+                                continue
+                            # For act_order, the group_size must be less than
+                            # size_k
+                            if act_order and (group_size == size_k or group_size == -1):
+                                continue
+                            for size_m in args.batch_sizes:
+                                bench_run(
+                                    results,
+                                    model,
+                                    act_order,
+                                    is_k_full,
+                                    quant_type,
+                                    group_size,
+                                    size_m,
+                                    size_k,
+                                    size_n,
+                                )
+    compare = benchmark.Compare(results)
+    compare.print()
+# For quick benchmarking use:
+#   python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark Marlin across specified models/shapes/batches"
+    )
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    parser.add_argument(
+        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
+    )
+    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+    parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+    args = parser.parse_args()
+    main(args)
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_moe.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import json
+import time
+from contextlib import nullcontext
+from datetime import datetime
+from itertools import product
+from typing import Any, TypedDict
+import ray
+import torch
+from ray.experimental.tqdm_ray import tqdm
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.platforms import current_platform
+from vllm.transformers_utils.config import get_config
+from vllm.triton_utils import triton
+from vllm.utils import FlexibleArgumentParser
+FP8_DTYPE = current_platform.fp8_dtype()
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    block_quant_shape: list[int] = None,
+    use_deep_gemm: bool = False,
+) -> float:
+    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    if use_int8_w8a16:
+        w1 = torch.randint(
+            -127,
+            127,
+            (
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+            ),
+            dtype=torch.int8,
+        )
+        w2 = torch.randint(
+            -127,
+            127,
+            (
+                num_experts,
+                hidden_size,
+                shard_intermediate_size // 2,
+            ),
+            dtype=torch.int8,
+        )
+    else:
+        w1 = torch.randn(
+            num_experts, shard_intermediate_size, hidden_size, dtype=init_dtype
+        )
+        w2 = torch.randn(
+            num_experts, hidden_size, shard_intermediate_size // 2, dtype=init_dtype
+        )
+    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    if use_int8_w8a16:
+        w1_scale = torch.randn(
+            (num_experts, 2 * shard_intermediate_size), dtype=torch.float32
+        )
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
+    if use_fp8_w8a8:
+        if block_quant_shape:
+            block_n, block_k = block_quant_shape[0], block_quant_shape[1]
+            E = num_experts
+            N = shard_intermediate_size // 2
+            K = hidden_size
+            factor_for_scale = 1e-2
+            n_tiles_w1 = (2 * N + block_n - 1) // block_n
+            n_tiles_w2 = (K + block_n - 1) // block_n
+            k_tiles_w1 = (K + block_k - 1) // block_k
+            k_tiles_w2 = (N + block_k - 1) // block_k
+            w1_scale = (
+                torch.rand((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
+                * factor_for_scale
+            )
+            w2_scale = (
+                torch.rand((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
+                * factor_for_scale
+            )
+        else:
+            w1_scale = torch.randn(num_experts, dtype=torch.float32)
+            w2_scale = torch.randn(num_experts, dtype=torch.float32)
+        a1_scale = torch.randn(1, dtype=torch.float32)
+        a2_scale = torch.randn(1, dtype=torch.float32)
+        w1 = w1.to(FP8_DTYPE)
+        w2 = w2.to(FP8_DTYPE)
+    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
+    def prepare(i: int):
+        input_gating.copy_(gating_output[i])
+    def run():
+        from vllm.model_executor.layers.fused_moe import override_config
+        with override_config(config):
+            if use_deep_gemm:
+                topk_weights, topk_ids, token_expert_indices = fused_topk(
+                    x, input_gating, topk, False
+                )
+                return fused_experts(
+                    x,
+                    w1,
+                    w2,
+                    topk_weights,
+                    topk_ids,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                    allow_deep_gemm=True,
+                )
+            else:
+                fused_moe(
+                    x,
+                    w1,
+                    w2,
+                    input_gating,
+                    topk,
+                    renormalize=True,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    use_int8_w8a16=use_int8_w8a16,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                )
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run()
+    torch.cuda.synchronize()
+    # Warmup
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    latencies: list[float] = []
+    for i in range(num_iters):
+        prepare(i)
+        torch.cuda.synchronize()
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    graph.reset()
+    return avg
+def get_rocm_tuning_space(use_fp16):
+    block_mn_range = [16, 32, 64, 128, 256]
+    block_k_range = [16, 32, 64, 128, 256]
+    if not use_fp16:
+        block_k_range.remove(16)  # BLOCK_K=16 not supported for fp8
+    num_warps_range = [1, 2, 4, 8]
+    group_m_range = [1, 4, 8, 16, 32]
+    num_stage_range = [2]
+    waves_per_eu_range = [0]
+    matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
+    kpack_range = [1, 2] if use_fp16 else []
+    param_ranges = {
+        "BLOCK_SIZE_M": block_mn_range,
+        "BLOCK_SIZE_N": block_mn_range,
+        "BLOCK_SIZE_K": block_k_range,
+        "GROUP_SIZE_M": group_m_range,
+        "num_warps": num_warps_range,
+        "num_stages": num_stage_range,
+        "waves_per_eu": waves_per_eu_range,
+    }
+    if use_fp16:
+        param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
+        param_ranges["kpack"] = kpack_range
+    return param_ranges
+def get_configs_compute_bound(use_fp16, block_quant_shape) -> list[dict[str, int]]:
+    configs: list[BenchmarkConfig] = []
+    if current_platform.is_rocm():
+        param_ranges = get_rocm_tuning_space(use_fp16)
+    else:
+        # Reduced search space for faster tuning.
+        # TODO(woosuk): Increase the search space and use a performance model to
+        # prune the search space.
+        block_m_range = [16, 32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [64, 128, 256]
+        num_warps_range = [4, 8]
+        group_m_range = [1, 16, 32, 64]
+        num_stage_range = [2, 3, 4, 5]
+        param_ranges = {
+            "BLOCK_SIZE_M": block_m_range,
+            "BLOCK_SIZE_N": block_n_range,
+            "BLOCK_SIZE_K": block_k_range,
+            "GROUP_SIZE_M": group_m_range,
+            "num_warps": num_warps_range,
+            "num_stages": num_stage_range,
+        }
+    keys, values = zip(*param_ranges.items())
+    for config_values in product(*values):
+        config = dict(zip(keys, config_values))
+        configs.append(config)
+    # Remove configs that are not compatible with fp8 block quantization
+    # BLOCK_SIZE_K must be a multiple of block_k
+    # BLOCK_SIZE_N must be a multiple of block_n
+    if block_quant_shape is not None and not use_fp16:
+        block_n, block_k = block_quant_shape[0], block_quant_shape[1]
+        for config in configs[:]:
+            if (
+                config["BLOCK_SIZE_K"] % block_k != 0
+                or config["BLOCK_SIZE_N"] % block_n != 0
+            ):
+                configs.remove(config)
+    return configs
+def prune_rocm_search_space(
+    num_tokens, shard_intermediate_size, hidden_size, search_space, is_fp16, topk
+):
+    N1, K1 = shard_intermediate_size, hidden_size
+    N2, K2 = hidden_size, shard_intermediate_size // 2
+    pruned_space_1 = prune_rocm_configs(
+        num_tokens * topk, N1, K1, search_space, is_fp16
+    )
+    pruned_space_2 = prune_rocm_configs(
+        num_tokens * topk, N2, K2, search_space, is_fp16
+    )
+    search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
+    return search_space
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+    pruned_configs = []
+    elemBytes_a = 2 if is_fp16 else 1
+    elemBytes_b = 2 if is_fp16 else 1
+    mfma = 16 if M < 32 or N < 32 else 32
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+        if is_fp16:
+            matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+            if matrix_instr_nonkdim > mfma:
+                continue
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts could not work properly in case
+        # number elements per thread is less 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = config.get("SPLIT_K", 1)
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if is_fp16:
+            if (
+                matrix_instr_nonkdim > BLOCK_SIZE_M
+                or matrix_instr_nonkdim > BLOCK_SIZE_N
+            ):
+                continue
+            if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M:
+                continue
+            if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N:
+                continue
+        # Skip BLOCK_SIZE that is too large compare to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (
+            BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a
+            + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b
+        )
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+        pruned_configs.append(config)
+    return pruned_configs
+def need_split_k(SIZE_M, SIZE_N, SIZE_K):
+    return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024
+def merge_unique_dicts(list1, list2):
+    result = []
+    combined_list = list1.copy()
+    combined_list.extend(list2)
+    for dictionary in combined_list:
+        if dictionary not in result:
+            result.append(dictionary)
+    return result
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+    def __init__(self, seed: int) -> None:
+        torch.set_default_device("cuda")
+        current_platform.seed_everything(seed)
+        self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU. This is required for Ray to work
+        # correctly with multi-GPU tuning on the ROCm platform.
+        self.device_id = int(ray.get_gpu_ids()[0])
+    def benchmark(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        block_quant_shape: list[int] = None,
+        use_deep_gemm: bool = False,
+    ) -> tuple[dict[str, int], float]:
+        current_platform.seed_everything(self.seed)
+        dtype_str = get_config_dtype_str(
+            dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+        )
+        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+        # is the intermediate size after silu_and_mul.
+        op_config = get_moe_configs(
+            num_experts, shard_intermediate_size // 2, dtype_str
+        )
+        if op_config is None:
+            config = get_default_config(
+                num_tokens,
+                num_experts,
+                shard_intermediate_size,
+                hidden_size,
+                topk,
+                dtype_str,
+                is_marlin=False,
+            )
+        else:
+            config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
+        kernel_time = benchmark_config(
+            config,
+            num_tokens,
+            num_experts,
+            shard_intermediate_size,
+            hidden_size,
+            topk,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            num_iters=100,
+            block_quant_shape=block_quant_shape,
+            use_deep_gemm=use_deep_gemm,
+        )
+        return config, kernel_time
+    def tune(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        search_space: list[dict[str, int]],
+        block_quant_shape: list[int],
+        use_deep_gemm: bool,
+    ) -> dict[str, int]:
+        best_config = None
+        best_time = float("inf")
+        if current_platform.is_rocm():
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+            search_space = prune_rocm_search_space(
+                num_tokens,
+                shard_intermediate_size,
+                hidden_size,
+                search_space,
+                is_fp16,
+                topk,
+            )
+        need_device_guard = False
+        if current_platform.is_rocm():
+            visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
+            if visible_device != f"{self.device_id}":
+                need_device_guard = True
+        with torch.cuda.device(self.device_id) if need_device_guard else nullcontext():
+            for config in tqdm(search_space):
+                try:
+                    kernel_time = benchmark_config(
+                        config,
+                        num_tokens,
+                        num_experts,
+                        shard_intermediate_size,
+                        hidden_size,
+                        topk,
+                        dtype,
+                        use_fp8_w8a8,
+                        use_int8_w8a16,
+                        num_iters=20,
+                        block_quant_shape=block_quant_shape,
+                        use_deep_gemm=use_deep_gemm,
+                    )
+                except triton.runtime.autotuner.OutOfResources:
+                    # Some configurations may be invalid and fail to compile.
+                    continue
+                if kernel_time < best_time:
+                    best_time = kernel_time
+                    best_config = config
+        now = datetime.now()
+        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
+        return best_config
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
+    return {
+        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
+        "num_warps": config["num_warps"],
+        "num_stages": config["num_stages"],
+        **(
+            {"waves_per_eu": config["waves_per_eu"]} if "waves_per_eu" in config else {}
+        ),
+        **(
+            {"matrix_instr_nonkdim": config["matrix_instr_nonkdim"]}
+            if "matrix_instr_nonkdim" in config
+            else {}
+        ),
+        **({"kpack": config["kpack"]} if "kpack" in config else {}),
+    }
+def save_configs(
+    configs: dict[int, BenchmarkConfig],
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    block_quant_shape: list[int],
+) -> None:
+    dtype_str = get_config_dtype_str(
+        dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
+    )
+    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+    # is the intermediate size after silu_and_mul.
+    filename = get_config_file_name(
+        num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
+    )
+    print(f"Writing best config to {filename}...")
+    with open(filename, "w") as f:
+        json.dump(configs, f, indent=4)
+        f.write("\n")
+def get_weight_block_size_safety(config, default_value=None):
+    quantization_config = getattr(config, "quantization_config", {})
+    if isinstance(quantization_config, dict):
+        return quantization_config.get("weight_block_size", default_value)
+    return default_value
+def main(args: argparse.Namespace):
+    print(args)
+    config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
+    if args.model_prefix:
+        config = getattr(config, args.model_prefix)
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"):
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    else:
+        # Support for llama4
+        config = config.get_text_config()
+        # Default: Mixtral.
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    hidden_size = config.hidden_size
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
+    use_int8_w8a16 = args.dtype == "int8_w8a16"
+    block_quant_shape = get_weight_block_size_safety(config)
+    if args.batch_size is None:
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+    else:
+        batch_sizes = args.batch_size
+    use_deep_gemm = bool(args.use_deep_gemm)
+    if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
+        # Ray will set ROCR_VISIBLE_DEVICES for device visibility
+        logger.warning(
+            "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
+            "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES."
+        )
+        val = os.environ["HIP_VISIBLE_DEVICES"]
+        os.environ["ROCR_VISIBLE_DEVICES"] = val
+        del os.environ["HIP_VISIBLE_DEVICES"]
+    ray.init()
+    num_gpus = int(ray.available_resources()["GPU"])
+    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+    def _distribute(method: str, inputs: list[Any]) -> list[Any]:
+        outputs = []
+        worker_idx = 0
+        for input_args in inputs:
+            worker = workers[worker_idx]
+            worker_method = getattr(worker, method)
+            output = worker_method.remote(*input_args)
+            outputs.append(output)
+            worker_idx = (worker_idx + 1) % num_gpus
+        return ray.get(outputs)
+    if args.tune:
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
+        print(f"Start tuning over {len(search_space)} configurations...")
+        start = time.time()
+        configs = _distribute(
+            "tune",
+            [
+                (
+                    batch_size,
+                    E,
+                    shard_intermediate_size,
+                    hidden_size,
+                    topk,
+                    dtype,
+                    use_fp8_w8a8,
+                    use_int8_w8a16,
+                    search_space,
+                    block_quant_shape,
+                    use_deep_gemm,
+                )
+                for batch_size in batch_sizes
+            ],
+        )
+        best_configs = {
+            M: sort_config(config) for M, config in zip(batch_sizes, configs)
+        }
+        save_configs(
+            best_configs,
+            E,
+            shard_intermediate_size,
+            hidden_size,
+            topk,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            block_quant_shape,
+        )
+        end = time.time()
+        print(f"Tuning took {end - start:.2f} seconds")
+    else:
+        outputs = _distribute(
+            "benchmark",
+            [
+                (
+                    batch_size,
+                    E,
+                    shard_intermediate_size,
+                    hidden_size,
+                    topk,
+                    dtype,
+                    use_fp8_w8a8,
+                    use_int8_w8a16,
+                    block_quant_shape,
+                    use_deep_gemm,
+                )
+                for batch_size in batch_sizes
+            ],
+        )
+        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
+            print(f"Batch size: {batch_size}, config: {config}")
+            print(f"Kernel time: {kernel_time:.2f} us")
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser.add_argument(
+        "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    )
+    parser.add_argument(
+        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
+    )
+    parser.add_argument(
+        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
+    )
+    parser.add_argument("--use-deep-gemm", action="store_true")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--batch-size", type=int, nargs="+", required=False)
+    parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--trust-remote-code", action="store_true")
+    parser.add_argument("--model-prefix", type=str, required=False)
+    args = parser.parse_args()
+    main(args)
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_moe_align_block_size.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_moe_align_block_size.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import itertools
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size_triton,
+)
+from vllm.triton_utils import triton
+def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
+    return torch.stack(
+        [
+            torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
+            for _ in range(num_tokens)
+        ]
+    )
+def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
+    """
+    Verifies vllm vs. Triton
+    """
+    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+    # 1. malloc space for triton and vllm
+    # malloc enough space (max_num_tokens_padded) for the sorted ids
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids_triton = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device="cuda"
+    )
+    sorted_ids_triton.fill_(topk_ids.numel())  # fill with sentinel value
+    expert_ids_triton = torch.zeros(
+        (max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
+    )
+    num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
+    sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
+    sorted_ids_vllm.fill_(topk_ids.numel())
+    expert_ids_vllm = torch.zeros_like(expert_ids_triton)
+    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
+    # 2. run implementations
+    moe_align_block_size_triton(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_triton,
+        expert_ids_triton,
+        num_tokens_post_pad_triton,
+    )
+    ops.moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids_vllm,
+        expert_ids_vllm,
+        num_tokens_post_pad_vllm,
+    )
+    print(f"✅ VLLM implementation works with {num_experts} experts!")
+    # 3. compare results
+    if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
+        num_tokens_post_pad_triton, num_tokens_post_pad_vllm
+    ):
+        print("✅ Triton and VLLM implementations match.")
+    else:
+        print("❌ Triton and VLLM implementations DO NOT match.")
+        print("Triton expert_ids:", expert_ids_triton)
+        print("VLLM expert_ids:", expert_ids_vllm)
+        print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
+        print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
+# test configurations
+num_tokens_range = [1, 16, 256, 4096]
+num_experts_range = [16, 64, 224, 256, 280, 512]
+topk_range = [1, 2, 8]
+configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens", "num_experts", "topk"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["vllm", "triton"],  # "triton"
+        line_names=["VLLM", "Triton"],  # "Triton"
+        plot_name="moe-align-block-size-performance",
+        args={},
+    )
+)
+def benchmark(num_tokens, num_experts, topk, provider):
+    """Benchmark function for Triton."""
+    block_size = 256
+    topk_ids = get_topk_ids(num_tokens, num_experts, topk)
+    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
+    sorted_ids.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
+    num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
+    quantiles = [0.5, 0.2, 0.8]
+    if provider == "vllm":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: ops.moe_align_block_size(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
+            quantiles=quantiles,
+        )
+    elif provider == "triton":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: moe_align_block_size_triton(
+                topk_ids,
+                num_experts,
+                block_size,
+                sorted_ids.clone(),
+                expert_ids.clone(),
+                num_tokens_post_pad.clone(),
+            ),
+            quantiles=quantiles,
+        )
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--num_experts",
+        type=int,
+        default=64,
+        choices=[8, 16, 32, 64, 128, 256],
+    )
+    parser.add_argument(
+        "--topk",
+        type=int,
+        default=8,
+        choices=[2, 4, 8],
+        help="Top-k value for correctness check.",
+    )
+    args = parser.parse_args()
+    print("Running correctness check...")
+    check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
+    benchmark.run(print_data=True, show_plots=True)
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+from typing import Any, TypedDict
+import ray
+import torch
+from transformers import AutoConfig
+from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+    _moe_permute,
+    _moe_unpermute_and_reduce,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import *
+from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+FP8_DTYPE = current_platform.fp8_dtype()
+class BenchmarkConfig(TypedDict):
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+def benchmark_permute(
+    num_tokens: int,
+    num_experts: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    use_customized_permute: bool = False,
+) -> float:
+    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    # output_hidden_states = torch.empty_like(hidden_states)
+    if use_fp8_w8a8:
+        align_block_size = 128  # deepgemm needs 128 m aligned block
+        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
+    else:
+        align_block_size = None
+        qhidden_states = hidden_states
+    gating_output = torch.randn(num_iters, num_tokens, num_experts, dtype=torch.float32)
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+    topk_weights, topk_ids, token_expert_indices = fused_topk(
+        qhidden_states, input_gating, topk, False
+    )
+    def prepare(i: int):
+        input_gating.copy_(gating_output[i])
+    def run():
+        if use_customized_permute:
+            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
+                moe_permute(
+                    qhidden_states,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    token_expert_indices=token_expert_indices,
+                    topk=topk,
+                    n_expert=num_experts,
+                    n_local_expert=num_experts,
+                    expert_map=None,
+                    align_block_size=align_block_size,
+                )
+            )
+        else:
+            (
+                permuted_hidden_states,
+                a1q_scale,
+                sorted_token_ids,
+                expert_ids,
+                inv_perm,
+            ) = _moe_permute(
+                qhidden_states, None, topk_ids, num_experts, None, align_block_size
+            )
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run()
+    torch.cuda.synchronize()
+    # Warmup
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    latencies: list[float] = []
+    for i in range(num_iters):
+        prepare(i)
+        torch.cuda.synchronize()
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    graph.reset()
+    return avg
+def benchmark_unpermute(
+    num_tokens: int,
+    num_experts: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    use_customized_permute: bool = False,
+) -> float:
+    # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    output_hidden_states = torch.empty_like(hidden_states)
+    if use_fp8_w8a8:
+        align_block_size = 128  # deepgemm needs 128 m aligned block
+        qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
+    else:
+        align_block_size = None
+        qhidden_states = hidden_states
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+    topk_weights, topk_ids, token_expert_indices = fused_topk(
+        qhidden_states, input_gating, topk, False
+    )
+    def prepare():
+        if use_customized_permute:
+            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = (
+                moe_permute(
+                    qhidden_states,
+                    topk_weights=topk_weights,
+                    topk_ids=topk_ids,
+                    token_expert_indices=token_expert_indices,
+                    topk=topk,
+                    n_expert=num_experts,
+                    n_local_expert=num_experts,
+                    expert_map=None,
+                    align_block_size=align_block_size,
+                )
+            )
+            # convert to fp16/bf16 as gemm output
+            return (
+                permuted_hidden_states.to(dtype),
+                first_token_off,
+                inv_perm_idx,
+                m_indices,
+            )
+        else:
+            (
+                permuted_qhidden_states,
+                a1q_scale,
+                sorted_token_ids,
+                expert_ids,
+                inv_perm,
+            ) = _moe_permute(
+                qhidden_states, None, topk_ids, num_experts, None, align_block_size
+            )
+            # convert to fp16/bf16 as gemm output
+            return (
+                permuted_qhidden_states.to(dtype),
+                a1q_scale,
+                sorted_token_ids,
+                expert_ids,
+                inv_perm,
+            )
+    def run(input: tuple):
+        if use_customized_permute:
+            (permuted_hidden_states, first_token_off, inv_perm_idx, m_indices) = input
+            moe_unpermute(
+                permuted_hidden_states,
+                topk_weights,
+                topk_ids,
+                inv_perm_idx,
+                first_token_off,
+                topk,
+                num_experts,
+                num_experts,
+            )
+        else:
+            (
+                permuted_hidden_states,
+                a1q_scale,
+                sorted_token_ids,
+                expert_ids,
+                inv_perm,
+            ) = input
+            _moe_unpermute_and_reduce(
+                output_hidden_states, permuted_hidden_states, inv_perm, topk_weights
+            )
+    # JIT compilation & warmup
+    input = prepare()
+    run(input)
+    torch.cuda.synchronize()
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run(input)
+    torch.cuda.synchronize()
+    # Warmup
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    latencies: list[float] = []
+    for i in range(num_iters):
+        torch.cuda.synchronize()
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    graph.reset()
+    return avg
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+    def __init__(self, seed: int) -> None:
+        torch.set_default_device("cuda")
+        current_platform.seed_everything(seed)
+        self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU. This is required for Ray to work
+        # correctly with multi-GPU tuning on the ROCm platform.
+        self.device_id = int(ray.get_gpu_ids()[0])
+    def benchmark(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        use_customized_permute: bool = False,
+    ) -> tuple[dict[str, int], float]:
+        current_platform.seed_everything(self.seed)
+        permute_time = benchmark_permute(
+            num_tokens,
+            num_experts,
+            hidden_size,
+            topk,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            num_iters=100,
+            use_customized_permute=use_customized_permute,
+        )
+        unpermute_time = benchmark_unpermute(
+            num_tokens,
+            num_experts,
+            hidden_size,
+            topk,
+            dtype,
+            use_fp8_w8a8,
+            use_int8_w8a16,
+            num_iters=100,
+            use_customized_permute=use_customized_permute,
+        )
+        return permute_time, unpermute_time
+def get_weight_block_size_safety(config, default_value=None):
+    quantization_config = getattr(config, "quantization_config", {})
+    if isinstance(quantization_config, dict):
+        return quantization_config.get("weight_block_size", default_value)
+    return default_value
+def main(args: argparse.Namespace):
+    print(args)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code
+    )
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+    elif config.architectures[0] == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+    elif (
+        config.architectures[0] == "DeepseekV3ForCausalLM"
+        or config.architectures[0] == "DeepseekV2ForCausalLM"
+    ):
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+    elif config.architectures[0] in ["Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"]:
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+    else:
+        # Support for llama4
+        config = config.get_text_config()
+        # Default: Mixtral.
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+    hidden_size = config.hidden_size
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
+    use_int8_w8a16 = args.dtype == "int8_w8a16"
+    use_customized_permute = args.use_customized_permute
+    if args.batch_size is None:
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+    ray.init()
+    num_gpus = int(ray.available_resources()["GPU"])
+    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+    def _distribute(method: str, inputs: list[Any]) -> list[Any]:
+        outputs = []
+        worker_idx = 0
+        for input_args in inputs:
+            worker = workers[worker_idx]
+            worker_method = getattr(worker, method)
+            output = worker_method.remote(*input_args)
+            outputs.append(output)
+            worker_idx = (worker_idx + 1) % num_gpus
+        return ray.get(outputs)
+    outputs = _distribute(
+        "benchmark",
+        [
+            (
+                batch_size,
+                E,
+                hidden_size,
+                topk,
+                dtype,
+                use_fp8_w8a8,
+                use_int8_w8a16,
+                use_customized_permute,
+            )
+            for batch_size in batch_sizes
+        ],
+    )
+    for batch_size, (permute, unpermute) in zip(batch_sizes, outputs):
+        print(f"Batch size: {batch_size}")
+        print(f"Permute time: {permute:.2f} us")
+        print(f"Unpermute time: {unpermute:.2f} us")
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+    parser.add_argument(
+        "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    )
+    parser.add_argument(
+        "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
+    )
+    parser.add_argument("--use-customized-permute", action="store_true")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--trust-remote-code", action="store_true")
+    args = parser.parse_args()
+    main(args)
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_paged_attention.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_paged_attention.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import time
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    FlexibleArgumentParser,
+    create_kv_caches_with_random,
+)
+logger = init_logger(__name__)
+NUM_BLOCKS = 128 * 1024
+PARTITION_SIZE = 512
+PARTITION_SIZE_ROCM = 256
+@torch.inference_mode()
+def main(
+    version: str,
+    num_seqs: int,
+    seq_len: int,
+    num_query_heads: int,
+    num_kv_heads: int,
+    head_size: int,
+    use_alibi: bool,
+    block_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    do_profile: bool,
+    device: str = "cuda",
+    kv_cache_dtype: Optional[str] = None,
+) -> None:
+    current_platform.seed_everything(seed)
+    scale = float(1.0 / (head_size**0.5))
+    query = torch.empty(
+        num_seqs, num_query_heads, head_size, dtype=dtype, device=device
+    )
+    query.uniform_(-scale, scale)
+    assert num_query_heads % num_kv_heads == 0
+    alibi_slopes = None
+    if use_alibi:
+        alibi_slopes = torch.randn(num_query_heads, dtype=torch.float, device=device)
+    seq_lens = [seq_len for _ in range(num_seqs)]
+    max_seq_len = max(seq_lens)
+    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
+    # Create the block tables.
+    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
+    block_tables_lst: list[list[int]] = []
+    for _ in range(num_seqs):
+        block_table = [
+            random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq)
+        ]
+        block_tables_lst.append(block_table)
+    block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device)
+    # Create the KV cache.
+    key_caches, value_caches = create_kv_caches_with_random(
+        NUM_BLOCKS,
+        block_size,
+        1,
+        num_kv_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+    # Prepare for the paged attention kernel.
+    output = torch.empty_like(query)
+    if version == "v2":
+        if current_platform.is_rocm():
+            global PARTITION_SIZE
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
+        num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
+        tmp_output = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
+        )
+        exp_sums = torch.empty(
+            size=(num_seqs, num_query_heads, num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+        # Using default kv_scale
+        k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
+        for _ in range(num_iters):
+            if version == "v1":
+                ops.paged_attention_v1(
+                    output,
+                    query,
+                    key_cache,
+                    value_cache,
+                    num_kv_heads,
+                    scale,
+                    block_tables,
+                    seq_lens,
+                    block_size,
+                    max_seq_len,
+                    alibi_slopes,
+                    kv_cache_dtype,
+                    k_scale,
+                    v_scale,
+                )
+            elif version == "v2":
+                if not args.custom_paged_attn:
+                    ops.paged_attention_v2(
+                        output,
+                        exp_sums,
+                        max_logits,
+                        tmp_output,
+                        query,
+                        key_cache,
+                        value_cache,
+                        num_kv_heads,
+                        scale,
+                        block_tables,
+                        seq_lens,
+                        block_size,
+                        max_seq_len,
+                        alibi_slopes,
+                        kv_cache_dtype,
+                        k_scale,
+                        v_scale,
+                    )
+                else:
+                    ops.paged_attention_rocm(
+                        output,
+                        exp_sums,
+                        max_logits,
+                        tmp_output,
+                        query,
+                        key_cache,
+                        value_cache,
+                        num_kv_heads,
+                        scale,
+                        block_tables,
+                        seq_lens,
+                        None,
+                        block_size,
+                        max_seq_len,
+                        alibi_slopes,
+                        kv_cache_dtype,
+                        k_scale,
+                        v_scale,
+                    )
+            else:
+                raise ValueError(f"Invalid version: {version}")
+        torch.cuda.synchronize()
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=3, profile=False)
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=100, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+if __name__ == "__main__":
+    logger.warning(
+        "This script benchmarks the paged attention kernel. "
+        "By default this is no longer used in vLLM inference."
+    )
+    parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.")
+    parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2")
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--seq-len", type=int, default=4096)
+    parser.add_argument("--num-query-heads", type=int, default=64)
+    parser.add_argument("--num-kv-heads", type=int, default=8)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--use-alibi", action="store_true")
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
+        default="auto",
+        help="Data type for kv cache storage. If 'auto', will use model "
+        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)",
+    )
+    parser.add_argument(
+        "--custom-paged-attn", action="store_true", help="Use custom paged attention"
+    )
+    args = parser.parse_args()
+    print(args)
+    if args.num_query_heads % args.num_kv_heads != 0:
+        raise ValueError("num_query_heads must be divisible by num_kv_heads")
+    main(
+        version=args.version,
+        num_seqs=args.batch_size,
+        seq_len=args.seq_len,
+        num_query_heads=args.num_query_heads,
+        num_kv_heads=args.num_kv_heads,
+        head_size=args.head_size,
+        block_size=args.block_size,
+        use_alibi=args.use_alibi,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        kv_cache_dtype=args.kv_cache_dtype,
+    )
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_quant.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_quant.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+import torch
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+@torch.inference_mode()
+def main(
+    num_tokens: int,
+    hidden_size: int,
+    static_scale: bool,
+    quant_dtype: torch.dtype,
+    dtype: torch.dtype,
+    seed: int = 0,
+    do_profile: bool = False,
+    num_warmup_iters: int = 5,
+    num_iters: int = 100,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device("cuda")
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+        for _ in range(num_iters):
+            if quant_dtype == torch.int8:
+                ops.scaled_int8_quant(x, scale)
+            else:
+                ops.scaled_fp8_quant(x, scale)
+        torch.cuda.synchronize()
+        end_time = time.perf_counter()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=num_warmup_iters, profile=False)
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=num_iters, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+if __name__ == "__main__":
+    def to_torch_dtype(dt):
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError(f"Unsupported dtype: {dt}")
+    parser = FlexibleArgumentParser(
+        description="Benchmark the quantization (fp8 or int8) kernel."
+    )
+    parser.add_argument("--num-tokens", type=int, default=4096)
+    parser.add_argument("--hidden-size", type=int, default=8192)
+    parser.add_argument("--static-scale", action="store_true")
+    parser.add_argument(
+        "--quant-dtype", type=str, choices=["fp8", "int8"], default="int8"
+    )
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="half"
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument("--num-warmup-iters", type=int, default=5)
+    parser.add_argument(
+        "--num-iters",
+        type=int,
+        default=100,
+        help="Number of benchmark iterations. "
+        "If --profile is set, this number is ignored",
+    )
+    args = parser.parse_args()
+    print(args)
+    main(
+        num_tokens=args.num_tokens,
+        hidden_size=args.hidden_size,
+        static_scale=args.static_scale,
+        quant_dtype=to_torch_dtype(args.quant_dtype),
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        num_warmup_iters=args.num_warmup_iters,
+        num_iters=args.num_iters,
+    )
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_rmsnorm.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+from typing import Optional, Union
+import torch
+from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from torch import nn
+from vllm import _custom_ops as vllm_ops
+from vllm.triton_utils import triton
+class HuggingFaceRMSNorm(nn.Module):
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x.to(orig_dtype) * self.weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
+def rmsnorm_naive(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
+    naive_norm.weight = nn.Parameter(weight)
+    naive_norm = naive_norm.to(x.device)
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+    output = naive_norm(x, residual)
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+def rmsnorm_flashinfer(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+    if residual is not None:
+        fused_add_rmsnorm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        output = rmsnorm(x, weight, eps)
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+def rmsnorm_vllm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+    if residual is not None:
+        vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        out = torch.empty_like(x)
+        vllm_ops.rms_norm(out, x, weight, eps)
+        output = out
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
+    dtype = torch.bfloat16
+    x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda")
+    weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+    residual = torch.randn_like(x) if use_residual else None
+    output_naive = rmsnorm_naive(
+        x.clone(), weight, residual.clone() if residual is not None else None
+    )
+    output_flashinfer = rmsnorm_flashinfer(
+        x.clone(), weight, residual.clone() if residual is not None else None
+    )
+    output_vllm = rmsnorm_vllm(
+        x.clone(), weight, residual.clone() if residual is not None else None
+    )
+    if use_residual:
+        output_naive = output_naive[0]
+        output_flashinfer = output_flashinfer[0]
+        output_vllm = output_vllm[0]
+    print(f"Naive output={output_naive}")
+    print(f"FlashInfer output={output_flashinfer}")
+    print(f"vLLM output={output_vllm}")
+    if torch.allclose(
+        output_naive, output_flashinfer, atol=1e-2, rtol=1e-2
+    ) and torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+head_num_range = [32, 48]
+configs = list(itertools.product(head_num_range, batch_size_range, seq_length_range))
+def get_benchmark(use_residual):
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["head_num", "batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["huggingface", "flashinfer", "vllm"],
+            line_names=["HuggingFace", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual",
+            args={},
+        )
+    )
+    def benchmark(head_num, batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        hidden_size = head_num * 128  # assuming head_dim = 128
+        x = torch.randn(batch_size, seq_len, hidden_size, dtype=dtype, device="cuda")
+        weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+        residual = torch.randn_like(x) if use_residual else None
+        quantiles = [0.5, 0.2, 0.8]
+        if provider == "huggingface":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_naive(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_flashinfer(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_vllm(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+    return benchmark
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=4,
+        help="Batch size",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=128,
+        help="Sequence length",
+    )
+    parser.add_argument(
+        "--hidden-size",
+        type=int,
+        default=4096,
+        help="Hidden size (2nd dimension) of the sequence",
+    )
+    parser.add_argument(
+        "--use-residual", action="store_true", help="Whether to use residual connection"
+    )
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/rmsnorm/",
+        help="Path to save rmsnorm benchmark results",
+    )
+    args = parser.parse_args()
+    # Run correctness test
+    calculate_diff(
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        hidden_size=args.hidden_size,
+        use_residual=args.use_residual,
+    )
+    # Get the benchmark function with proper use_residual setting
+    benchmark = get_benchmark(args.use_residual)
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_rope.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_rope.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from itertools import accumulate
+from typing import Optional
+import nvtx
+import torch
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: float = 10000,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # silulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        {"rope_type": "linear", "factor": tuple(scaling_factors)},
+    )
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes: list[RotaryEmbedding] = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                {"rope_type": "linear", "factor": (scaling_factor,)},
+            )
+        )
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size, seq_len, num_heads * head_size, dtype=dtype)
+    key = torch.randn_like(query)
+    # create query offsets for batched RoPE, we concat multiple kv cache
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate(
+                [0]
+                + [
+                    max_position * scaling_factor * 2
+                    for scaling_factor in scaling_factors[:-1]
+                ]
+            )
+        )
+    )
+    query_types = torch.randint(
+        0, len(scaling_factors), (batch_size, seq_len), device=device
+    )
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+    # batched queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark the rotary embedding kernels."
+    )
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument(
+        "--dtype", type=str, choices=["bfloat16", "float"], default="float"
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--device", type=str, choices=["cuda:0", "cuda:1"], default="cuda:0"
+    )
+    args = parser.parse_args()
+    print(args)
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_shapes.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_shapes.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+WEIGHT_SHAPES = {
+    "ideal": [[4 * 256 * 32, 256 * 32]],
+    "mistralai/Mistral-7B-v0.1/TP1": [
+        [4096, 6144],
+        [4096, 4096],
+        [4096, 28672],
+        [14336, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP2": [
+        [4096, 3072],
+        [2048, 4096],
+        [4096, 14336],
+        [7168, 4096],
+    ],
+    "mistralai/Mistral-7B-v0.1/TP4": [
+        [4096, 1536],
+        [1024, 4096],
+        [4096, 7168],
+        [3584, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP1": [
+        [4096, 12288],
+        [4096, 4096],
+        [4096, 22016],
+        [11008, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP2": [
+        [4096, 6144],
+        [2048, 4096],
+        [4096, 11008],
+        [5504, 4096],
+    ],
+    "meta-llama/Llama-2-7b-hf/TP4": [
+        [4096, 3072],
+        [1024, 4096],
+        [4096, 5504],
+        [2752, 4096],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP1": [
+        [5120, 15360],
+        [5120, 5120],
+        [5120, 27648],
+        [13824, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP2": [
+        [5120, 7680],
+        [2560, 5120],
+        [5120, 13824],
+        [6912, 5120],
+    ],
+    "meta-llama/Llama-2-13b-hf/TP4": [
+        [5120, 3840],
+        [1280, 5120],
+        [5120, 6912],
+        [3456, 5120],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP1": [
+        [8192, 10240],
+        [8192, 8192],
+        [8192, 57344],
+        [28672, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP2": [
+        [8192, 5120],
+        [4096, 8192],
+        [8192, 28672],
+        [14336, 8192],
+    ],
+    "meta-llama/Llama-2-70b-hf/TP4": [
+        [8192, 2560],
+        [2048, 8192],
+        [8192, 14336],
+        [7168, 8192],
+    ],
+}
+WEIGHT_SHAPES_MOE = {
+    "nm-testing/Mixtral-8x7B-Instruct-v0.1": [
+        [8, 2, 4096, 28672],
+        [8, 2, 14336, 4096],
+    ],
+    "nm-testing/deepseekv2-lite": [
+        [64, 6, 2048, 1408],
+    ],
+    "ibm-granite/granite-3.0-1b-a400m": [
+        [32, 8, 1024, 1024],
+    ],
+    "ibm-granite/granite-3.0-3b-a800m": [
+        [40, 8, 1024, 1536],
+    ],
+}
--- a/offline_benchmark_test/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/offline_benchmark_test/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from sglang quantization/tuning_block_wise_kernel.py
+import argparse
+import json
+import multiprocessing as mp
+import os
+import time
+from datetime import datetime
+from typing import Any
+import torch
+import tqdm
+import triton
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    _w8a8_block_fp8_matmul,
+)
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+mp.set_start_method("spawn", force=True)
+assert current_platform.is_cuda(), (
+    "Only support tune w8a8 block fp8 kernel on CUDA device."
+)
+DTYPE_MAP = {
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "half": torch.half,
+    "bfloat16": torch.bfloat16,
+}
+def w8a8_block_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: list[int],
+    config: dict[str, Any],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with
+    block-wise quantization.
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization.
+                    It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    M = A.numel() // A.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+    if A.dtype == torch.float8_e4m3fn:
+        kernel = _w8a8_block_fp8_matmul
+    else:
+        raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
+    return C
+def get_configs_compute_bound():
+    configs = []
+    for num_stages in [2, 3, 4, 5]:
+        for block_m in [16, 32, 64, 128, 256]:
+            for block_k in [64, 128]:
+                for block_n in [32, 64, 128, 256]:
+                    for num_warps in [4, 8]:
+                        for group_size in [1, 16, 32, 64]:
+                            configs.append(
+                                {
+                                    "BLOCK_SIZE_M": block_m,
+                                    "BLOCK_SIZE_N": block_n,
+                                    "BLOCK_SIZE_K": block_k,
+                                    "GROUP_SIZE_M": group_size,
+                                    "num_warps": num_warps,
+                                    "num_stages": num_stages,
+                                }
+                            )
+    return configs
+def get_weight_shapes(tp_size):
+    # NOTE(HandH1998): The weight shapes only works for DeepSeek-V3.
+    # Modify them, if you tune for another different model.
+    # cannot TP
+    total = [
+        (512 + 64, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (7168, 16384),
+        (7168, 18432),
+    ]
+    # N can TP
+    n_tp = [
+        (18432 * 2, 7168),
+        ((128 + 64) * 128, 7168),
+        (128 * (128 + 128), 512),
+        (24576, 1536),
+        (12288, 7168),
+        (4096, 7168),
+    ]
+    # K can TP
+    k_tp = [(7168, 18432), (7168, 16384), (7168, 2048)]
+    weight_shapes = []
+    for t in total:
+        weight_shapes.append(t)
+    for n_t in n_tp:
+        new_t = (n_t[0] // tp_size, n_t[1])
+        weight_shapes.append(new_t)
+    for k_t in k_tp:
+        new_t = (k_t[0], k_t[1] // tp_size)
+        weight_shapes.append(new_t)
+    return weight_shapes
+def benchmark_config(
+    A, B, As, Bs, block_size, config, out_dtype=torch.float16, num_iters=10
+):
+    def run():
+        w8a8_block_matmul(A, B, As, Bs, block_size, config, out_dtype)
+    torch.cuda.synchronize()
+    # JIT complication & warmup
+    for _ in range(5):
+        run()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    latencies: list[float] = []
+    for i in range(num_iters):
+        torch.cuda.synchronize()
+        start_event.record()
+        run()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us
+    return avg
+def tune(M, N, K, block_size, out_dtype, search_space, input_type):
+    factor_for_scale = 1e-2
+    if input_type == "fp8":
+        fp8_info = torch.finfo(torch.float8_e4m3fn)
+        fp8_max, fp8_min = fp8_info.max, fp8_info.min
+        A_fp32 = (
+            (torch.rand(M, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+        )
+        A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+        B_fp32 = (
+            (torch.rand(N, K, dtype=torch.float32, device="cuda") - 0.5) * 2 * fp8_max
+        )
+        B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    else:
+        raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    As = torch.rand(M, k_tiles, dtype=torch.float32, device="cuda") * factor_for_scale
+    Bs = (
+        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="cuda")
+        * factor_for_scale
+    )
+    best_config = None
+    best_time = float("inf")
+    for config in tqdm(search_space):
+        try:
+            kernel_time = benchmark_config(
+                A,
+                B,
+                As,
+                Bs,
+                block_size,
+                config,
+                out_dtype,
+                num_iters=10,
+            )
+        except triton.runtime.autotuner.OutOfResources:
+            # Some configurations may be invalid and fail to compile.
+            continue
+        if kernel_time < best_time:
+            best_time = kernel_time
+            best_config = config
+    now = datetime.now()
+    print(f"{now.ctime()}] Completed tuning for batch_size={M}")
+    assert best_config is not None
+    return best_config
+def save_configs(
+    N,
+    K,
+    block_n,
+    block_k,
+    configs,
+    save_path,
+    input_type="fp8",
+) -> None:
+    os.makedirs(save_path, exist_ok=True)
+    device_name = current_platform.get_device_name().replace(" ", "_")
+    json_file_name = (
+        f"N={N},K={K},device_name={device_name},dtype={input_type}_w8a8,"
+        f"block_shape=[{block_n},{block_k}].json"
+    )
+    config_file_path = os.path.join(save_path, json_file_name)
+    print(f"Writing best config to {config_file_path}...")
+    with open(config_file_path, "w") as f:
+        json.dump(configs, f, indent=4)
+        f.write("\n")
+def tune_on_gpu(args_dict):
+    """Run tuning on a specific GPU."""
+    gpu_id = args_dict["gpu_id"]
+    batch_sizes = args_dict["batch_sizes"]
+    weight_shapes = args_dict["weight_shapes"]
+    args = args_dict["args"]
+    torch.cuda.set_device(gpu_id)
+    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
+    block_n = args.block_n
+    block_k = args.block_k
+    out_dtype = DTYPE_MAP[args.out_dtype]
+    save_path = args.save_path
+    input_type = args.input_type
+    search_space = get_configs_compute_bound()
+    search_space = [
+        config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
+    ]
+    start = time.time()
+    for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
+        N, K = shape[0], shape[1]
+        print(f"[GPU {gpu_id}] Tune for weight shape of `N: {N}, K: {K}`")
+        benchmark_results = [
+            tune(
+                batch_size,
+                N,
+                K,
+                [block_n, block_k],
+                out_dtype,
+                search_space,
+                input_type,
+            )
+            for batch_size in tqdm(batch_sizes, desc=f"GPU {gpu_id} - Batch sizes")
+        ]
+        best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
+        save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
+    end = time.time()
+    print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")
+def distribute_batch_sizes(batch_sizes, num_gpus):
+    """Distribute batch sizes across available GPUs."""
+    batches_per_gpu = []
+    for i in range(num_gpus):
+        start_idx = i * len(batch_sizes) // num_gpus
+        end_idx = (i + 1) * len(batch_sizes) // num_gpus
+        batches_per_gpu.append(batch_sizes[start_idx:end_idx])
+    return batches_per_gpu
+def main(args):
+    print(args)
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        raise RuntimeError("No GPU available for tuning")
+    print(f"Found {num_gpus} GPUs for parallel tuning")
+    torch.cuda.init()
+    if args.batch_size is None:
+        batch_sizes = [
+            1,
+            2,
+            4,
+            8,
+            16,
+            24,
+            32,
+            48,
+            64,
+            96,
+            128,
+            256,
+            512,
+            1024,
+            1536,
+            2048,
+            3072,
+            4096,
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+        num_gpus = 1  # If only one batch size, use only one GPU
+    weight_shapes = get_weight_shapes(args.tp_size)
+    batches_per_gpu = distribute_batch_sizes(batch_sizes, num_gpus)
+    process_args = []
+    for gpu_id in range(num_gpus):
+        process_args.append(
+            {
+                "gpu_id": gpu_id,
+                "batch_sizes": batches_per_gpu[gpu_id],
+                "weight_shapes": weight_shapes,  # Each GPU processes all weight shapes
+                "args": args,
+            }
+        )
+    ctx = mp.get_context("spawn")
+    with ctx.Pool(num_gpus) as pool:
+        pool.map(tune_on_gpu, process_args)
+    print("Multi-GPU tuning completed")
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="""
+Tune triton w8a8 block fp8 for DeepSeek-V3/DeepSeek-R1:
+    python3 benchmark_w8a8_block_fp8.py --tp-size 8 --input-type fp8
+Then copy to model_executor/layers/quantization/utils/configs
+        """,
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument("--tp-size", "-tp", type=int, default=8)
+    parser.add_argument("--input-type", type=str, choices=["fp8"], default="fp8")
+    parser.add_argument(
+        "--out-dtype",
+        type=str,
+        choices=["float32", "float16", "bfloat16", "half"],
+        default="float16",
+    )
+    parser.add_argument("--block-n", type=int, default=128)
+    parser.add_argument("--block-k", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--save-path", type=str, default="./")
+    args = parser.parse_args()
+    main(args)
--- a/offline_benchmark_test/benchmarks/kernels/deepgemm/README.md
+++ b/offline_benchmark_test/benchmarks/kernels/deepgemm/README.md
+# DeepSeek DeepGEMM Kernels Benchmark
+This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
+Currently this just includes dense GEMMs and only works on Hopper GPUs.
+## Setup
+You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:
+```
+git clone --recursive https://github.com/deepseek-ai/DeepGEMM
+cd DeepGEMM
+python setup.py install
+uv pip install -e .
+```
+## Usage
+```
+python benchmark_fp8_block_dense_gemm.py
+INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
+===== STARTING FP8 GEMM BENCHMARK =====
+PyTorch version: 2.5.1+cu124
+CUDA version: 12.4
+Triton version: 3.1.0
+Using device: NVIDIA H100 80GB HBM3
+WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
+INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
+WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
+WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
+INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
+INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
+===== PERFORMANCE COMPARISON =====
+DeepGEMM Implementation:
+------+-------+-------+-----------+--------+--------+
+| m    | n     | k     | Time (μs) | TFLOPS | GB/s   |
+------+-------+-------+-----------+--------+--------+
+|    8 |  4096 |  7168 | 102.9     | 4.6    | 286.4  |
+|    8 |  7168 | 18432 | 70.8      | 29.8   | 1868.8 |
+|    8 | 18432 |  7168 | 69.3      | 30.5   | 1911.8 |
+|   64 |  4096 |  7168 | 69.1      | 54.4   | 439.0  |
+|   64 |  7168 | 18432 | 69.4      | 243.6  | 1933.6 |
+|   64 | 18432 |  7168 | 70.4      | 240.3  | 1917.2 |
+|   64 | 24576 |  1536 | 70.1      | 68.9   | 584.6  |
+|   64 | 32768 |   512 | 68.4      | 31.4   | 307.1  |
+|   64 |  7168 | 16384 | 69.5      | 216.3  | 1718.5 |
+|  128 |  4096 |  7168 | 141.1     | 53.3   | 222.1  |
+|  128 |  7168 | 18432 | 71.9      | 470.5  | 1896.1 |
+|  128 | 18432 |  7168 | 69.3      | 488.2  | 1988.2 |
+| 1024 |  4096 |  7168 | 89.7      | 670.1  | 502.5  |
+| 1024 | 18432 |  7168 | 279.0     | 969.8  | 635.2  |
+| 2048 |  4096 |  7168 | 175.1     | 687.0  | 347.4  |
+| 4096 |  4096 |  7168 | 335.4     | 717.0  | 275.1  |
+------+-------+-------+-----------+--------+--------+
+vLLM Triton Implementation:
+------+-------+-------+-----------+--------+--------+--------------+
+| m    | n     | k     | Time (μs) | TFLOPS | GB/s   | vs DeepGEMM  |
+------+-------+-------+-----------+--------+--------+--------------+
+|    8 |  4096 |  7168 | 74.0      | 6.3    | 398.2  | 1.39x faster |
+|    8 |  7168 | 18432 | 89.6      | 23.6   | 1478.1 | 0.79x slower |
+|    8 | 18432 |  7168 | 113.2     | 18.7   | 1170.4 | 0.61x slower |
+|   64 |  4096 |  7168 | 79.4      | 47.3   | 382.2  | 0.87x slower |
+|   64 |  7168 | 18432 | 98.5      | 171.7  | 1363.0 | 0.70x slower |
+|   64 | 18432 |  7168 | 119.5     | 141.5  | 1129.4 | 0.59x slower |
+|   64 | 24576 |  1536 | 37.6      | 128.4  | 1089.7 | 1.86x faster |
+|   64 | 32768 |   512 | 38.7      | 55.5   | 542.6  | 1.77x faster |
+|   64 |  7168 | 16384 | 86.1      | 174.5  | 1386.4 | 0.81x slower |
+|  128 |  4096 |  7168 | 90.7      | 82.9   | 345.4  | 1.56x faster |
+|  128 |  7168 | 18432 | 144.0     | 234.9  | 946.9  | 0.50x slower |
+|  128 | 18432 |  7168 | 229.5     | 147.4  | 600.1  | 0.30x slower |
+| 1024 |  4096 |  7168 | 242.3     | 248.2  | 186.1  | 0.37x slower |
+| 1024 | 18432 |  7168 | 897.8     | 301.4  | 197.4  | 0.31x slower |
+| 2048 |  4096 |  7168 | 463.0     | 259.7  | 131.4  | 0.38x slower |
+| 4096 |  4096 |  7168 | 901.8     | 266.7  | 102.3  | 0.37x slower |
+------+-------+-------+-----------+--------+--------+--------------+
+vLLM CUTLASS Implementation:
+------+-------+-------+-----------+--------+--------+--------------+--------------+
+| m    | n     | k     | Time (μs) | TFLOPS | GB/s   | vs DeepGEMM  | vs Triton    |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
+|    8 |  4096 |  7168 | 34.6      | 13.6   | 852.3  | 2.98x faster | 2.14x faster |
+|    8 |  7168 | 18432 | 78.9      | 26.8   | 1677.3 | 0.90x slower | 1.13x faster |
+|    8 | 18432 |  7168 | 81.2      | 26.0   | 1631.1 | 0.85x slower | 1.39x faster |
+|   64 |  4096 |  7168 | 36.9      | 101.9  | 822.9  | 1.87x faster | 2.15x faster |
+|   64 |  7168 | 18432 | 87.4      | 193.4  | 1535.2 | 0.79x slower | 1.13x faster |
+|   64 | 18432 |  7168 | 85.0      | 199.0  | 1587.6 | 0.83x slower | 1.41x faster |
+|   64 | 24576 |  1536 | 28.0      | 172.8  | 1465.8 | 2.51x faster | 1.35x faster |
+|   64 | 32768 |   512 | 28.8      | 74.5   | 728.5  | 2.37x faster | 1.34x faster |
+|   64 |  7168 | 16384 | 77.9      | 193.0  | 1532.8 | 0.89x slower | 1.11x faster |
+|  128 |  4096 |  7168 | 39.1      | 192.4  | 802.0  | 3.61x faster | 2.32x faster |
+|  128 |  7168 | 18432 | 93.7      | 360.8  | 1454.2 | 0.77x slower | 1.54x faster |
+|  128 | 18432 |  7168 | 85.7      | 394.8  | 1608.0 | 0.81x slower | 2.68x faster |
+| 1024 |  4096 |  7168 | 99.7      | 603.1  | 452.2  | 0.90x slower | 2.43x faster |
+| 1024 | 18432 |  7168 | 331.3     | 816.7  | 534.9  | 0.84x slower | 2.71x faster |
+| 2048 |  4096 |  7168 | 198.3     | 606.6  | 306.7  | 0.88x slower | 2.34x faster |
+| 4096 |  4096 |  7168 | 392.2     | 613.2  | 235.3  | 0.86x slower | 2.30x faster |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
+===== AVERAGE PERFORMANCE =====
+----------------+------------+----------+---------------+
+| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) |
+----------------+------------+----------+---------------+
+| DeepGEMM       | 310.98     | 1052.10  | 0.11          |
+| vLLM Triton    | 144.30     | 715.60   | 0.23          |
+| vLLM CUTLASS   | 286.78     | 1076.67  | 0.11          |
+----------------+------------+----------+---------------+
+===== AVERAGE SPEEDUPS =====
+-----------------------------+--------------+
+| Comparison                  | Speedup      |
+-----------------------------+--------------+
+| DeepGEMM vs vLLM Triton     | 1.71x faster |
+| DeepGEMM vs vLLM CUTLASS    | 0.94x slower |
+| vLLM CUTLASS vs vLLM Triton | 1.84x faster |
+-----------------------------+--------------+
+===== ACCURACY COMPARISON =====
+----------------+-----------------------+
+| Implementation | Avg Diff vs Reference |
+----------------+-----------------------+
+| DeepGEMM       | 0.000684              |
+| vLLM Triton    | 0.000684              |
+| vLLM CUTLASS   | 0.000684              |
+----------------+-----------------------+
+```
--- a/offline_benchmark_test/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/offline_benchmark_test/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# fmt: off
+# ruff: noqa: E501
+import time
+# Import DeepGEMM functions
+import deep_gemm
+import torch
+from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
+# Import vLLM functions
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8,
+    w8a8_block_fp8_matmul,
+)
+from vllm.triton_utils import triton
+# Copied from
+# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
+def per_token_cast_to_fp8(
+        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Convert tensor to FP8 format with per-token scaling."""
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
+        torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
+# Copied from
+# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
+def per_block_cast_to_fp8(
+        x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    """Convert tensor to FP8 format with per-block scaling."""
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
+                           dtype=x.dtype,
+                           device=x.device)
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
+        x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+def benchmark_shape(m: int,
+                    n: int,
+                    k: int,
+                    warmup: int = 100,
+                    repeat: int = 10000,
+                    verbose: bool = False) -> dict:
+    """Benchmark all implementations for a specific (m, n, k) shape."""
+    if verbose:
+        print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
+    # Create test tensors
+    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
+    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+    # Reference result in BF16
+    torch.cuda.synchronize()
+    C_ref = A @ B.t()
+    # Pre-quantize B for all implementations
+    # (weights can be pre-quantized offline)
+    B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
+    B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
+    # Block size configuration
+    block_size = [128, 128]
+    # Pre-quantize A for all implementations
+    A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
+    A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
+    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+    A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
+    A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
+        A, block_size[1], column_major_scales=True)
+    # === DeepGEMM Implementation ===
+    def deepgemm_gemm():
+        deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
+                                       (B_deepgemm, B_scale_deepgemm),
+                                       C_deepgemm)
+        return C_deepgemm
+    # === vLLM Triton Implementation ===
+    def vllm_triton_gemm():
+        return w8a8_block_fp8_matmul(A_vllm,
+                                     B_vllm,
+                                     A_scale_vllm,
+                                     B_scale_vllm,
+                                     block_size,
+                                     output_dtype=torch.bfloat16)
+    # === vLLM CUTLASS Implementation ===
+    def vllm_cutlass_gemm():
+        return ops.cutlass_scaled_mm(A_vllm_cutlass,
+                                     B_vllm.T,
+                                     scale_a=A_scale_vllm_cutlass,
+                                     scale_b=B_scale_vllm.T,
+                                     out_dtype=torch.bfloat16)
+    # Run correctness check first
+    if verbose:
+        print("Running correctness check...")
+    C_deepgemm = deepgemm_gemm()
+    C_vllm_triton = vllm_triton_gemm()
+    C_vllm_cutlass = vllm_cutlass_gemm()
+    deepgemm_diff = calc_diff(C_deepgemm, C_ref)
+    vllm_triton_diff = calc_diff(C_vllm_triton, C_ref)
+    vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref)
+    if verbose:
+        print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
+        print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
+        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
+        print("vLLM Triton vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
+        print("vLLM CUTLASS vs DeepGEMM difference: "
+              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
+    # Benchmark implementations
+    implementations = {
+        "DeepGEMM": deepgemm_gemm,
+        "vLLM Triton": vllm_triton_gemm,
+        "vLLM CUTLASS": vllm_cutlass_gemm
+    }
+    benchmark_results = {
+        "shape": {
+            "m": m,
+            "n": n,
+            "k": k
+        },
+        "implementations": {}
+    }
+    for name, func in implementations.items():
+        # Warmup
+        for _ in range(warmup):
+            func()
+            torch.cuda.synchronize()
+        # Timing loop
+        torch.cuda.synchronize()
+        start = time.time()
+        for _ in range(repeat):
+            func()
+        torch.cuda.synchronize()
+        end = time.time()
+        # Calculate timing and TFLOPS
+        avg_time_ms = (end - start) / repeat * 1000
+        avg_time_us = avg_time_ms * 1000
+        tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12
+        gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3)
+        benchmark_results["implementations"][name] = {
+            "time_ms": avg_time_ms,
+            "time_us": avg_time_us,
+            "tflops": tflops,
+            "gb_s": gb_s,
+            "diff": {
+                "DeepGEMM":
+                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
+                "Reference":
+                deepgemm_diff if name == "DeepGEMM" else
+                (vllm_triton_diff
+                 if name == "vLLM Triton" else vllm_cutlass_diff)
+            }
+        }
+        if verbose:
+            print(
+                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
+            )
+    # Calculate speedups
+    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
+    for name, data in benchmark_results["implementations"].items():
+        if name != "DeepGEMM":
+            speedup = baseline / data["time_ms"]
+            benchmark_results["implementations"][name][
+                "speedup_vs_deepgemm"] = speedup
+            if verbose:
+                print(f"DeepGEMM is {1/speedup:.2f}x "
+                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
+    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
+        "time_ms"]
+    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
+        "time_ms"]
+    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
+    benchmark_results["implementations"]["vLLM CUTLASS"][
+        "speedup_vs_triton"] = cutlass_vs_triton
+    if verbose:
+        print(
+            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
+            f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton"
+        )
+    return benchmark_results
+def format_table_row(values, widths):
+    """Format a row with specified column widths."""
+    return "| " + " | ".join(f"{val:{w}}"
+                             for val, w in zip(values, widths)) + " |"
+def print_table(headers, rows, title=None):
+    """Print a table with headers and rows."""
+    if title:
+        print(f"\n{title}")
+    # Calculate column widths based on headers and data
+    widths = [
+        max(len(str(h)), max(len(str(row[i])) for row in rows))
+        for i, h in enumerate(headers)
+    ]
+    # Create separator line
+    separator = "+-" + "-+-".join("-" * w for w in widths) + "-+"
+    # Print table
+    print(separator)
+    print(format_table_row(headers, widths))
+    print(separator)
+    for row in rows:
+        print(format_table_row(row, widths))
+    print(separator)
+def format_speedup(value):
+    """Format speedup value with indicator if it's faster or slower."""
+    return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}"
+def run_benchmarks(verbose: bool = False):
+    """Run benchmarks for a set of common shapes."""
+    print("===== STARTING FP8 GEMM BENCHMARK =====")
+    # Make sure we're using the GPU
+    if not torch.cuda.is_available():
+        print("CUDA not available! Tests require GPU.")
+        return
+    # Print system information
+    print(f"PyTorch version: {torch.__version__}")
+    print(f"CUDA version: {torch.version.cuda}")
+    print(f"Triton version: {triton.__version__}")
+    print(f"Using device: {torch.cuda.get_device_name()}")
+    # Enable TF32 for better performance
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    # Set seeds for reproducibility
+    torch.manual_seed(42)
+    torch.cuda.manual_seed(42)
+    # Define benchmark shapes (m, n, k)
+    shapes = [
+        (8, 4096, 7168),
+        (8, 7168, 18432),
+        (8, 18432, 7168),
+        (64, 4096, 7168),
+        (64, 7168, 18432),
+        (64, 18432, 7168),
+        (64, 24576, 1536),
+        (64, 32768, 512),
+        (64, 7168, 16384),
+        (128, 4096, 7168),
+        (128, 7168, 18432),
+        (128, 18432, 7168),
+        (1024, 4096, 7168),
+        (1024, 18432, 7168),
+        (2048, 4096, 7168),
+        (4096, 4096, 7168),
+    ]
+    shapes = [
+        # (64, 2112, 7168),
+        (64, 24576, 1536),
+        (64, 32768, 512),
+        (64, 7168, 16384),
+        (64, 4096, 7168),
+        (64, 7168, 2048),
+        # (128, 2112, 7168),
+        (128, 24576, 1536),
+        (128, 32768, 512),
+        (128, 7168, 16384),
+        (128, 4096, 7168),
+        (128, 7168, 2048),
+        # (4096, 2112, 7168),
+        (4096, 24576, 1536),
+        (4096, 32768, 512),
+        (4096, 7168, 16384),
+        (4096, 4096, 7168),
+        (4096, 7168, 2048),
+    ]
+    all_results = []
+    for m, n, k in shapes:
+        result = benchmark_shape(m, n, k, verbose=verbose)
+        all_results.append(result)
+    # Print results in a nicely formatted table
+    print("\n===== PERFORMANCE COMPARISON =====")
+    # Print DeepGEMM table
+    deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"]
+    deepgemm_rows = []
+    for result in all_results:
+        shape = result["shape"]
+        impl_data = result["implementations"]["DeepGEMM"]
+        deepgemm_rows.append([
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
+        ])
+    print_table(deepgemm_headers,
+                deepgemm_rows,
+                title="DeepGEMM Implementation:")
+    # Print vLLM Triton table
+    triton_headers = [
+        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
+    ]
+    triton_rows = []
+    for result in all_results:
+        shape = result["shape"]
+        impl_data = result["implementations"]["vLLM Triton"]
+        speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
+        triton_rows.append([
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
+            format_speedup(speedup)
+        ])
+    print_table(triton_headers,
+                triton_rows,
+                title="vLLM Triton Implementation:")
+    # Print vLLM CUTLASS table
+    cutlass_headers = [
+        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
+        "vs Triton"
+    ]
+    cutlass_rows = []
+    for result in all_results:
+        shape = result["shape"]
+        impl_data = result["implementations"]["vLLM CUTLASS"]
+        vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
+        vs_triton = impl_data.get("speedup_vs_triton", 1.0)
+        cutlass_rows.append([
+            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
+            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
+            format_speedup(vs_deepgemm),
+            format_speedup(vs_triton)
+        ])
+    print_table(cutlass_headers,
+                cutlass_rows,
+                title="vLLM CUTLASS Implementation:")
+    # Calculate and print averages
+    print("\n===== AVERAGE PERFORMANCE =====")
+    implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
+    avg_metrics = {
+        impl: {
+            "tflops": 0,
+            "gb_s": 0,
+            "time_ms": 0
+        }
+        for impl in implementations
+    }
+    for result in all_results:
+        for impl in implementations:
+            impl_data = result["implementations"][impl]
+            avg_metrics[impl]["tflops"] += impl_data["tflops"]
+            avg_metrics[impl]["gb_s"] += impl_data["gb_s"]
+            avg_metrics[impl]["time_ms"] += impl_data["time_ms"]
+    num_shapes = len(all_results)
+    avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"]
+    avg_rows = []
+    for impl in implementations:
+        avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
+        avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
+        avg_time = avg_metrics[impl]["time_ms"] / num_shapes
+        avg_rows.append([
+            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
+        ])
+    print_table(avg_headers, avg_rows)
+    # Calculate average speedups
+    avg_speedups = {
+        "DeepGEMM vs vLLM Triton": 0,
+        "DeepGEMM vs vLLM CUTLASS": 0,
+        "vLLM CUTLASS vs vLLM Triton": 0
+    }
+    for result in all_results:
+        deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
+        vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
+        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
+            "time_ms"]
+        avg_speedups[
+            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+        avg_speedups[
+            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+        avg_speedups[
+            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
+    print("\n===== AVERAGE SPEEDUPS =====")
+    speedup_headers = ["Comparison", "Speedup"]
+    speedup_rows = []
+    for comparison, total in avg_speedups.items():
+        avg_speedup = total / num_shapes
+        status = "faster" if avg_speedup > 1 else "slower"
+        speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"])
+    print_table(speedup_headers, speedup_rows)
+    # Average accuracy comparison
+    print("\n===== ACCURACY COMPARISON =====")
+    avg_diff = {impl: 0 for impl in implementations}
+    for result in all_results:
+        for impl in implementations:
+            avg_diff[impl] += result["implementations"][impl]["diff"][
+                "Reference"]
+    diff_headers = ["Implementation", "Avg Diff vs Reference"]
+    diff_rows = []
+    for impl in implementations:
+        diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"])
+    print_table(diff_headers, diff_rows)
+if __name__ == "__main__":
+    run_benchmarks(verbose=False)
--- a/offline_benchmark_test/benchmarks/kernels/graph_machete_bench.py
+++ b/offline_benchmark_test/benchmarks/kernels/graph_machete_bench.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+import pickle
+from collections import defaultdict
+import matplotlib.pyplot as plt
+import pandas as pd
+import regex as re
+import seaborn as sns
+from torch.utils.benchmark import Measurement as TMeasurement
+from vllm.utils import FlexibleArgumentParser
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion."
+    )
+    parser.add_argument("filename", type=str)
+    args = parser.parse_args()
+    with open(args.filename, "rb") as f:
+        data = pickle.load(f)
+        raw_results: list[TMeasurement] = data["results"]
+    results = defaultdict(lambda: list())
+    for v in raw_results:
+        result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
+        if result is not None:
+            KN = result.group(1)
+        else:
+            raise Exception("MKN not found")
+        result = re.search(r"MKN=\((\d+)x\d+x\d+\)", v.task_spec.sub_label)
+        if result is not None:
+            M = result.group(1)
+        else:
+            raise Exception("MKN not found")
+        kernel = v.task_spec.description
+        results[KN].append({"kernel": kernel, "batch_size": M, "median": v.median})
+    rows = int(math.ceil(len(results) / 2))
+    fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
+    axs = axs.flatten()
+    for axs_idx, (shape, data) in enumerate(results.items()):
+        plt.sca(axs[axs_idx])
+        df = pd.DataFrame(data)
+        sns.lineplot(
+            data=df,
+            x="batch_size",
+            y="median",
+            hue="kernel",
+            style="kernel",
+            markers=True,
+            dashes=False,
+            palette="Dark2",
+        )
+        plt.title(f"Shape: {shape}")
+        plt.ylabel("time (median, s)")
+    plt.tight_layout()
+    plt.savefig("graph_machete_bench.pdf")
--- a/offline_benchmark_test/benchmarks/kernels/requirements.txt
+++ b/offline_benchmark_test/benchmarks/kernels/requirements.txt
+pandas
\ No newline at end of file
--- a/offline_benchmark_test/benchmarks/kernels/utils.py
+++ b/offline_benchmark_test/benchmarks/kernels/utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import dataclasses
+from collections.abc import Iterable
+from typing import Any, Callable, Optional
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+@dataclasses.dataclass
+class CudaGraphBenchParams:
+    num_ops_in_cuda_graph: int
+@dataclasses.dataclass
+class ArgPool:
+    """
+    When some argument of the benchmarking function is annotated with this type,
+    the benchmarking class (BenchMM) will collapse the argument to a pick a
+    single value from the given list of values, during function invocation.
+    For every invocation during a benchmarking run, it will choose a
+    different value from the list.
+    """
+    values: Iterable[Any]
+    def __getitem__(self, index):
+        return self.values[index]
+class Bench:
+    class ArgsIterator:
+        def __init__(self, args_list, kwargs_list):
+            assert len(args_list) == len(kwargs_list)
+            self.args_list = args_list
+            self.kwargs_list = kwargs_list
+            self.n = len(self.args_list)
+            self.idx = 0
+        def __next__(self):
+            while True:
+                yield (self.args_list[self.idx], self.kwargs_list[self.idx])
+                self.idx += 1
+                self.idx = self.idx % self.n
+        def reset(self):
+            self.idx = 0
+        @property
+        def n_args(self):
+            return self.n
+    def __init__(
+        self,
+        cuda_graph_params: Optional[CudaGraphBenchParams],
+        label: str,
+        sub_label: str,
+        description: str,
+        fn: Callable,
+        *args,
+        **kwargs,
+    ):
+        self.cuda_graph_params = cuda_graph_params
+        self.use_cuda_graph = self.cuda_graph_params is not None
+        self.label = label
+        self.sub_label = sub_label
+        self.description = description
+        self.fn = fn
+        # Process args
+        self._args = args
+        self._kwargs = kwargs
+        self.args_list, self.kwargs_list = self.collapse_argpool(*args, **kwargs)
+        self.args_iterator = self.ArgsIterator(self.args_list, self.kwargs_list)
+        # Cudagraph runner
+        self.g = None
+        if self.use_cuda_graph:
+            self.g = self.get_cuda_graph_runner()
+        # benchmark run params
+        self.min_run_time = 1
+    def collapse_argpool(self, *args, **kwargs):
+        argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
+            arg for arg in kwargs.values() if isinstance(arg, ArgPool)
+        ]
+        if len(argpool_args) == 0:
+            return [args], [kwargs]
+        # Make sure all argpools are of the same size
+        argpool_size = len(argpool_args[0].values)
+        assert all([argpool_size == len(arg.values) for arg in argpool_args])
+        # create copies of the args
+        args_list = []
+        kwargs_list = []
+        for _ in range(argpool_size):
+            args_list.append(args)
+            kwargs_list.append(kwargs.copy())
+        for i in range(argpool_size):
+            # collapse args; Just pick the ith value
+            args_list[i] = tuple(
+                [arg[i] if isinstance(arg, ArgPool) else arg for arg in args_list[i]]
+            )
+            # collapse kwargs
+            kwargs_i = kwargs_list[i]
+            arg_pool_keys = [k for k, v in kwargs_i.items() if isinstance(v, ArgPool)]
+            for k in arg_pool_keys:
+                # again just pick the ith value
+                kwargs_i[k] = kwargs_i[k][i]
+            kwargs_list[i] = kwargs_i
+        return args_list, kwargs_list
+    def get_cuda_graph_runner(self):
+        assert self.use_cuda_graph
+        assert self.args_iterator is not None
+        num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
+        # warmup
+        args_it = self.args_iterator.__next__()
+        for _ in range(2):
+            args, kwargs = next(args_it)
+            self.fn(*args, **kwargs)
+        self.args_iterator.reset()
+        args_it = self.args_iterator.__next__()
+        stream = torch.cuda.Stream()
+        with torch.cuda.stream(stream):
+            g = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(g):
+                for _ in range(num_graph_ops):
+                    args, kwargs = next(args_it)
+                    self.fn(*args, **kwargs)
+        return g
+    def run_cudagrah(self) -> TMeasurement:
+        assert self.use_cuda_graph
+        globals = {"g": self.g}
+        return TBenchmark.Timer(
+            stmt="g.replay()",
+            globals=globals,
+            label=(
+                f"{self.label}"
+                f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
+            ),
+            sub_label=self.sub_label,
+            description=self.description,
+        ).blocked_autorange(min_run_time=self.min_run_time)
+    def run_eager(self) -> TMeasurement:
+        setup = None
+        stmt = None
+        globals = None
+        has_arg_pool = self.args_iterator.n_args > 1
+        if has_arg_pool:
+            setup = """
+                    args_iterator.reset()
+                    args_it = args_iterator.__next__()
+                    """
+            stmt = """
+                    args, kwargs = next(args_it)
+                    fn(*args, **kwargs)
+                    """
+            globals = {"fn": self.fn, "args_iterator": self.args_iterator}
+        else:
+            # no arg pool. Just use the args and kwargs directly
+            self.args_iterator.reset()
+            args_it = self.args_iterator.__next__()
+            args, kwargs = next(args_it)
+            setup = ""
+            stmt = """
+                    fn(*args, **kwargs)
+                   """
+            globals = {"fn": self.fn, "args": args, "kwargs": kwargs}
+        return TBenchmark.Timer(
+            stmt=stmt,
+            setup=setup,
+            globals=globals,
+            label=self.label,
+            sub_label=self.sub_label,
+            description=self.description,
+        ).blocked_autorange(min_run_time=self.min_run_time)
+    def run(self) -> TMeasurement:
+        timer = None
+        if self.use_cuda_graph:  # noqa SIM108
+            timer = self.run_cudagrah()
+        else:
+            timer = self.run_eager()
+        if not timer.meets_confidence() or timer.has_warnings:
+            print("Doesn't meet confidence - re-running bench ...")
+            return self.run()
+        return timer
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        if exc_type:
+            print(f"exc type {exc_type}")
+            print(f"exc value {exc_value}")
+            print(f"exc traceback {traceback}")
--- a/offline_benchmark_test/benchmarks/kernels/weight_shapes.py
+++ b/offline_benchmark_test/benchmarks/kernels/weight_shapes.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+#   - TP1 : K = 14336, N = 4096
+#   - TP2 : K = 7168, N = 4096
+#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+#   - TP1 : K = 4096, N = 6144
+#   - TP4 : K = 4096, N = 1536
+# TP1 shapes
+WEIGHT_SHAPES = {
+    "mistralai/Mistral-7B-v0.1": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-2-7b-hf": [
+        ([4096, 12288], 1),
+        ([4096, 4096], 0),
+        ([4096, 22016], 1),
+        ([11008, 4096], 0),
+    ],
+    "meta-llama/Llama-3-8b": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-2-13b-hf": [
+        ([5120, 15360], 1),
+        ([5120, 5120], 0),
+        ([5120, 27648], 1),
+        ([13824, 5120], 0),
+    ],
+    "meta-llama/Llama-2-70b-hf": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "meta-llama/Llama-3.1-405b-hf": [
+        ([16384, 18432], 1),
+        ([16384, 16384], 0),
+        ([16384, 106496], 1),
+        ([53248, 16384], 0),
+    ],
+    "meta-llama/Llama-3.1-8B-Instruct": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-3.3-70B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+    "mistralai/Mistral-Large-Instruct-2407": [
+        ([12288, 14336], 1),
+        ([12288, 12288], 0),
+        ([12288, 57344], 1),
+        ([28672, 12288], 0),
+    ],
+    "Qwen/Qwen2.5-7B-Instruct": [
+        ([3584, 4608], 1),
+        ([3584, 3584], 0),
+        ([3584, 37888], 1),
+        ([18944, 3584], 0),
+    ],
+    "Qwen/Qwen2.5-32B-Instruct": [
+        ([5120, 7168], 1),
+        ([5120, 5120], 0),
+        ([5120, 55296], 1),
+        ([27648, 5120], 0),
+    ],
+    "Qwen/Qwen2.5-72B-Instruct": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 59136], 1),
+        ([29568, 8192], 0),
+    ],
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
+        ([2048, 3072], 1),
+        ([2048, 4096], 1),
+        ([2048, 2048], 0),
+        ([2048, 576], 0),
+        ([2048, 21888], 1),
+        ([10944, 2048], 0),
+        ([2048, 2816], 1),
+        ([1408, 2048], 0),
+    ],
+}
--- a/offline_benchmark_test/benchmarks/overheads/benchmark_hashing.py
+++ b/offline_benchmark_test/benchmarks/overheads/benchmark_hashing.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import cProfile
+import pstats
+from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser
+# A very long prompt, total number of tokens is about 15k.
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
+LONG_PROMPT = " ".join(LONG_PROMPT)
+def main(args):
+    llm = LLM(
+        model=args.model,
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        tensor_parallel_size=args.tensor_parallel_size,
+    )
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    profiler = cProfile.Profile()
+    print("------warm up------")
+    for i in range(3):
+        output = llm.generate(LONG_PROMPT, sampling_params)
+        print(output[0].outputs[0].text)
+    print("------start generating------")
+    for i in range(3):
+        profiler.runctx(
+            "llm.generate(LONG_PROMPT, sampling_params)", globals(), locals()
+        )
+    # analyze the runtime of hashing function
+    stats = pstats.Stats(profiler)
+    stats.sort_stats("cumulative")
+    total_time = 0
+    total_calls = 0
+    for func in stats.stats:
+        if "hash_of_block" in func[2]:
+            total_time = stats.stats[func][3]
+            total_calls = stats.stats[func][0]
+    percentage = (total_time / stats.total_tt) * 100
+    print(
+        f"Hashing took {total_time:.2f} seconds,{percentage:.2f}% of the total runtime."
+    )
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance of hashing function in"
+        "automatic prefix caching."
+    )
+    parser.add_argument("--model", type=str, default="lmsys/longchat-7b-16k")
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--output-len", type=int, default=10)
+    parser.add_argument(
+        "--enable-prefix-caching", action="store_true", help="enable prefix caching"
+    )
+    args = parser.parse_args()
+    main(args)
--- a/offline_benchmark_test/benchmarks/pyproject.toml
+++ b/offline_benchmark_test/benchmarks/pyproject.toml
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+[tool.ruff]
+line-length = 88
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+[tool.ruff.lint.isort]
+known-first-party = ["vllm"]
+[tool.ruff.format]
+docstring-code-format = true
\ No newline at end of file
--- a/offline_benchmark_test/benchmarks/run_structured_output_benchmark.sh
+++ b/offline_benchmark_test/benchmarks/run_structured_output_benchmark.sh
+#!/bin/bash
+# default values
+MODEL=${MODEL:-"Qwen/Qwen2.5-7B-Instruct"}
+BACKEND=${BACKEND:-"vllm"}
+DATASET=${DATASET:-"xgrammar_bench"}
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OUTPUT_DIR=${OUTPUT_DIR:-"$SCRIPT_DIR/structured_output_benchmark_results"}
+PORT=${PORT:-8000}
+STRUCTURED_OUTPUT_RATIO=${STRUCTURED_OUTPUT_RATIO:-1}
+TOTAL_SECONDS=${TOTAL_SECONDS:-90}
+MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-300}
+TOKENIZER_MODE=${TOKENIZER_MODE:-"auto"}
+usage() {
+    echo "Usage: $0 [options]"
+    echo "Options:"
+    echo "  --model MODEL                  Model to benchmark (default: $MODEL)"
+    echo "  --backend BACKEND              Backend to use (default: $BACKEND)" 
+    echo "  --dataset DATASET              Dataset to use (default: $DATASET)"
+    echo "  --max-new-tokens N             Maximum number of tokens to generate (default: $MAX_NEW_TOKENS)"
+    echo "  --output-dir DIR               Output directory for results (default: $OUTPUT_DIR)"
+    echo "  --port PORT                    Port to use (default: $PORT)"
+    echo "  --structured-output-ratio N    Ratio of structured outputs (default: $STRUCTURED_OUTPUT_RATIO)"
+    echo "  --tokenizer-mode MODE          Tokenizer mode to use (default: $TOKENIZER_MODE)"
+    echo "  --total-seconds N              Total seconds to run the benchmark (default: $TOTAL_SECONDS)"
+    echo "  -h, --help                     Show this help message and exit"
+    exit 0
+}
+# parse command line arguments
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --model)
+      MODEL="$2"
+      shift 2
+      ;;
+    --backend)
+      BACKEND="$2"
+      shift 2
+      ;;
+    --dataset)
+      DATASET="$2"
+      shift 2
+      ;;
+    --max-new-tokens)
+      MAX_NEW_TOKENS="$2"
+      shift 2
+      ;;
+    --output-dir)
+      OUTPUT_DIR="$2"
+      shift 2
+      ;;
+    --port)
+      PORT="$2"
+      shift 2
+      ;;
+    --structured-output-ratio)
+      STRUCTURED_OUTPUT_RATIO="$2"
+      shift 2
+      ;;
+    --tokenizer-mode)
+      TOKENIZER_MODE="$2"
+      shift 2
+      ;;
+    --total-seconds)
+      TOTAL_SECONDS="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      echo "Unknown argument: $1\n"
+      usage
+      ;;
+  esac
+done
+# Create output directory if it doesn't exist
+mkdir -p "$OUTPUT_DIR"
+# Define QPS values to test
+QPS_VALUES=(25 20 15 10 5 1)
+# Common parameters
+COMMON_PARAMS="--backend $BACKEND \
+               --model $MODEL \
+               --dataset $DATASET \
+               --structured-output-ratio $STRUCTURED_OUTPUT_RATIO \
+               --save-results \
+               --result-dir $OUTPUT_DIR \
+               --output-len $MAX_NEW_TOKENS \
+               --port $PORT \
+               --tokenizer-mode $TOKENIZER_MODE"
+echo "Starting structured output benchmark with model: $MODEL"
+echo "Backend: $BACKEND"
+echo "Dataset: $DATASET"
+echo "Results will be saved to: $OUTPUT_DIR"
+echo "----------------------------------------"
+# Run benchmarks with different QPS values
+for qps in "${QPS_VALUES[@]}"; do
+  echo "Running benchmark with QPS: $qps"
+  # Get git hash and branch for the filename
+  GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
+  GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
+  # Construct filename for this run
+  FILENAME="${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
+  NUM_PROMPTS=$(echo "$TOTAL_SECONDS * $qps" | bc)
+  NUM_PROMPTS=${NUM_PROMPTS%.*}  # Remove fractional part
+  echo "Running benchmark with $NUM_PROMPTS prompts"
+  # Run the benchmark
+  python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
+    --request-rate $qps \
+    --result-filename "$FILENAME" \
+    --num-prompts $NUM_PROMPTS
+  echo "Completed benchmark with QPS: $qps"
+  echo "----------------------------------------"
+done
+echo "All benchmarks completed!"
+echo "Results saved to: $OUTPUT_DIR"