Merge tag 'v0.6.5' into v0.6.5-dev

4d3a2c28 · zhuwenwen · 92ec5d8e · 2d1b9baa · 4d3a2c28 · 4d3a2c28
Commit 4d3a2c28 authored Dec 30, 2024 by zhuwenwen
20 changed files
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+import pickle as pkl
+import time
+from dataclasses import dataclass
+from itertools import product
+from typing import Callable, Iterable, List, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from tqdm import tqdm
+
+import vllm._custom_ops as ops
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+@dataclass
+class bench_params_t:
+    num_tokens: int
+    hidden_size: int
+    add_residual: bool
+    dtype: torch.dtype
+
+    def description(self):
+        return (f'N {self.num_tokens} '
+                f'x D {self.hidden_size} '
+                f'x R {self.add_residual} '
+                f'x DT {self.dtype}')
+
+
+def get_bench_params() -> List[bench_params_t]:
+    ## Test Fixtures
+    NUM_TOKENS = [2**x for x in range(11)]
+    HIDDEN_SIZES = list(range(1024, 8129, 1024))
+    ADD_RESIDUAL = [True, False]
+    DTYPES = [torch.bfloat16, torch.float]
+
+    combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES)
+    bench_params = list(map(lambda x: \
+        bench_params_t(x[0], x[1], x[2], x[3]), combinations))
+    return bench_params
+
+
+# Reference impls
+def unfused_int8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
+                      residual: Optional[torch.Tensor],
+                      quant_dtype: torch.dtype):
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _, _ = ops.scaled_int8_quant(torch_out)
+
+
+def unfused_fp8_impl(rms_norm_layer: RMSNorm, x: torch.Tensor,
+                     residual: Optional[torch.Tensor],
+                     quant_dtype: torch.dtype):
+    # Norm
+    torch_out = None
+    if residual is None:
+        torch_out = rms_norm_layer.forward_cuda(x, residual)
+    else:
+        torch_out, _ = rms_norm_layer.forward_cuda(x, residual)
+
+    # Quant
+    torch_out, _ = ops.scaled_fp8_quant(torch_out)
+
+
+def fused_impl(
+        rms_norm_layer: RMSNorm,  # this stores the weights
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        quant_dtype: torch.dtype):
+    out, _ = ops.rms_norm_dynamic_per_token_quant(x,
+                                                  rms_norm_layer.weight,
+                                                  1e-6,
+                                                  quant_dtype,
+                                                  residual=residual)
+
+
+# Bench functions
+def bench_fn(rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor,
+             quant_dtype: torch.dtype, label: str, sub_label: str,
+             fn: Callable, description: str) -> TMeasurement:
+
+    min_run_time = 1
+
+    globals = {
+        "rms_norm_layer": rms_norm_layer,
+        "x": x,
+        "residual": residual,
+        "quant_dtype": quant_dtype,
+        "fn": fn,
+    }
+    return TBenchmark.Timer(
+        stmt="fn(rms_norm_layer, x, residual, quant_dtype)",
+        globals=globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+
+def bench(params: bench_params_t, label: str, sub_label: str) \
+        -> Iterable[TMeasurement]:
+
+    # Make inputs
+    layer = RMSNorm(params.hidden_size, 1e-6).to(dtype=params.dtype)
+    # Make weights
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    # Make inputs
+    scale = 1 / params.hidden_size
+    x = torch.randn(params.num_tokens,
+                    params.hidden_size,
+                    dtype=params.dtype,
+                    device='cuda') * scale
+    residual = (torch.randn_like(x) * scale).to(device='cuda') \
+            if params.add_residual else None
+
+    timers = []
+
+    # unfused int8 impl.
+    timers.append(
+        bench_fn(layer, x, residual, torch.int8, label, sub_label,
+                 unfused_int8_impl, "unfused_int8_impl"))
+
+    # unfused fp8 impl.
+    timers.append(
+        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
+                 unfused_fp8_impl, "unfused_fp8_impl"))
+
+    # fused int8 impl.
+    timers.append(
+        bench_fn(layer, x, residual, torch.int8, label, sub_label, fused_impl,
+                 "fused_int8_impl"))
+
+    # fused fp8 impl.
+    timers.append(
+        bench_fn(layer, x, residual, torch.float8_e4m3fn, label, sub_label,
+                 fused_impl, "fused_fp8_impl"))
+
+    print_timers(timers)
+
+    return timers
+
+
+# launch bench
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+
+def main():
+    torch.set_default_device('cuda')
+    bench_params = get_bench_params()
+
+    timers = []
+    for bp in tqdm(bench_params):
+        timers.extend(
+            bench(bp, "rms-norm-dynamic-per-token-quant", bp.description()))
+    print_timers(timers)
+
+    # pickle all the results
+    timestamp = int(time.time())
+    with open(f"rms_norm_dpt_quant-{timestamp}.pkl", "wb") as f:
+        pkl.dump(timers, f)
+
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -3,8 +3,8 @@ import time
 import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


 @torch.inference_mode()
@@ -16,7 +16,7 @@ def main(num_tokens: int,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
-    seed_everything(seed)
+    current_platform.seed_everything(seed)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=dtype)

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -2,8 +2,10 @@ import argparse
 import copy
 import itertools
 import math
+import os
 import pickle as pkl
 import time
+from dataclasses import dataclass
 from itertools import product
 from typing import Callable, Iterable, List, Optional, Tuple

@@ -15,11 +17,12 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales,
+    marlin_zero_points)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    MarlinWorkspace)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, pack_rows, quantize_weights)
+    pack_rows, quantize_weights)
 from vllm.scalar_type import ScalarType, scalar_types
 from vllm.utils import FlexibleArgumentParser

@@ -27,149 +30,350 @@ DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
 DEFAULT_TP_SIZES = [1]

+NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False)
+
+if NVTX_PROFILE:
+    import nvtx
+
+
+def terse_type_name(dt):
+    return {
+        torch.bfloat16: "bf16",
+        torch.float16: "fp16",
+        torch.int8: "int8",
+        torch.float8_e4m3fn: "fp8",
+        torch.bfloat16: "bf16",
+        torch.float: "float",
+        torch.int: "int",
+    }[dt]
+
+
+@dataclass
+class BenchmarkTensors:
+    w_ref: torch.Tensor
+    a: torch.Tensor
+
+    w_q: torch.Tensor
+    group_size: Optional[int]
+    wtype: ScalarType
+    w_g_s: torch.Tensor
+    w_g_zp: Optional[torch.Tensor]
+    w_ch_s: Optional[torch.Tensor]
+    w_tok_s: Optional[torch.Tensor]
+
+
+@dataclass
+class TypeConfig:
+    act_type: torch.dtype
+    weight_type: ScalarType
+    output_type: Optional[torch.dtype]
+    group_scale_type: Optional[torch.dtype]
+    group_zero_type: Optional[torch.dtype]
+    channel_scale_type: Optional[torch.dtype]
+    token_scale_type: Optional[torch.dtype]
+
+
+def rand_data(shape, dtype=torch.float16, scale=1):
+    if dtype.is_floating_point:
+        return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype)
+    else:
+        return torch.randint(-15, 15, shape, dtype=dtype, device="cuda")
+
+
+def quantize_and_pack(atype: torch.dtype,
+                      w: torch.Tensor,
+                      wtype: ScalarType,
+                      stype: Optional[torch.dtype],
+                      group_size: Optional[int],
+                      zero_points: bool = False):
+    assert wtype.is_integer(), "TODO: support floating point weights"
+
+    w_ref, w_q, w_s, w_zp = quantize_weights(
+        w,
+        wtype,
+        group_size=group_size,
+        zero_points=zero_points,
+        # to match how the kernel applies zps
+        ref_zero_points_after_scales=True)

-def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
    w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape)
-    w_q = w_q.t().contiguous().t()  # make col major
-    return ops.machete_prepack_B(w_q, wtype)
+    return w_ref, w_q, w_s, w_zp


-def make_bench_tensors(
-    atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
-    k: int
-) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
-                                    torch.tensor]]]:
-    assert wtype.is_integer(), "TODO: support floating point weights"
+def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
+                         group_size: Optional[int]) -> List[BenchmarkTensors]:
+    m, n, k = shape

    # we want to make sure that weights don't fit into L2 cache between runs so
    #  we construct enough weights to exceed L2 cache, which is 50mb on a H100
    #  so we target total weight size > 2*50mb
-    num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits))
-
-    a = torch.randn((m, k), device="cuda", dtype=atype) * 5
-    weights = [
-        torch.randn((k, n), device="cuda", dtype=atype)
-        for _ in range(num_weights)
-    ]
-    quanitized_weights = [
-        quantize_weights(w, wtype, group_size) for w in weights
-    ]
-
-    return a, quanitized_weights
+    num_weights = math.ceil(2 * 50 * 1024**2 * 8 /
+                            (k * n * types.weight_type.size_bits))
+
+    a = rand_data((m, k), types.act_type, scale=5)
+
+    benchmark_tensors: List[BenchmarkTensors] = []
+    for _ in range(num_weights):
+        w = rand_data((k, n), types.act_type, scale=5)
+
+        if types.group_scale_type is not None:
+            w = w.to(types.group_scale_type)
+        if w.dtype.itemsize == 1:
+            w = w.to(torch.float16)
+
+        w_ref, w_q_packed, w_s, w_zp = quantize_and_pack(
+            a.dtype, w, types.weight_type, types.group_scale_type, group_size,
+            types.group_zero_type is not None)
+
+        if not a.dtype.is_floating_point:
+            aiinfo = torch.iinfo(a.dtype)
+            w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max)
+
+        w_ref = w_ref.to(torch.float32)
+
+        w_ch_s = None if types.channel_scale_type is None else\
+            rand_data((n,), types.channel_scale_type)
+        w_tok_s = None if types.token_scale_type is None else\
+            rand_data((m,), types.token_scale_type)
+
+        benchmark_tensors.append(
+            BenchmarkTensors(w_ref=w_ref,
+                             a=a,
+                             w_q=w_q_packed,
+                             wtype=types.weight_type,
+                             w_g_s=w_s,
+                             w_g_zp=w_zp,
+                             group_size=group_size,
+                             w_ch_s=w_ch_s,
+                             w_tok_s=w_tok_s))
+
+    return benchmark_tensors
+
+
+def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+    a = bt.a
+    w = bt.w_ref.to(bt.a.dtype)  # use float reference tensor
+    if a.dtype not in [torch.float16, torch.bfloat16]:
+        a = a.to(torch.float16)
+        w = w.to(torch.float16)
+    return lambda: torch.matmul(a, w)
+
+
+def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+    if bt.w_ch_s is not None and bt.w_tok_s is not None:
+        scale_a = bt.w_tok_s.to(torch.float32)
+        scale_b = bt.w_ch_s.to(torch.float32)
+    else:
+        scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+        scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device)
+    w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t()
+    return lambda: ops.cutlass_scaled_mm(
+        bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16)
+
+
+def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
+    device = bt.a.device
+
+    workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
+                                GPTQ_MARLIN_MAX_PARALLEL)
+
+    if bt.w_g_zp is None:
+        w_zp = torch.empty(0, dtype=torch.int, device=device)
+    else:
+        w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0],
+                                  bt.w_ref.shape[1], bt.wtype.size_bits)
+
+    if bt.group_size is None:
+        w_s = torch.tensor([], device="cuda", dtype=torch.half)
+    else:
+        w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0],
+                                    bt.w_ref.shape[1], bt.group_size)
+
+    sort_indices = torch.empty(0, dtype=torch.int, device=device)
+    g_idx = torch.empty(0, dtype=torch.int, device=device)
+    w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0],
+                                 bt.w_ref.shape[1], bt.wtype.size_bits)
+
+    if bt.a.dtype.is_floating_point:
+        assert bt.w_ch_s is None
+        assert bt.w_tok_s is None
+        assert bt.group_size is not None
+
+        fn = lambda: ops.gptq_marlin_gemm(a=bt.a,
+                                          b_q_weight=w_q,
+                                          b_scales=w_s,
+                                          b_zeros=w_zp,
+                                          g_idx=g_idx,
+                                          perm=sort_indices,
+                                          workspace=workspace.scratch,
+                                          b_q_type=bt.wtype,
+                                          size_m=bt.a.shape[0],
+                                          size_n=bt.w_ref.shape[1],
+                                          size_k=bt.w_ref.shape[0],
+                                          is_k_full=True,
+                                          is_zp_float=False)
+    else:
+        assert bt.a.dtype == torch.int8
+        assert bt.wtype == scalar_types.uint4b8
+
+        if bt.w_ch_s is not None:
+            s_ch = bt.w_ch_s.to(torch.float32)
+        else:
+            s_ch = torch.ones(bt.w_ref.shape[1],
+                              dtype=torch.float32,
+                              device=device)
+
+        if bt.w_tok_s is not None:
+            s_tok = bt.w_tok_s.to(torch.float32)
+        else:
+            s_tok = torch.ones(bt.a.shape[0],
+                               dtype=torch.float32,
+                               device=device)
+
+        fn = lambda: ops.marlin_qqq_gemm(a=bt.a,
+                                         b_q_weight=w_q,
+                                         s_group=w_s,
+                                         s_tok=s_tok,
+                                         s_ch=s_ch,
+                                         workspace=workspace.scratch,
+                                         size_m=bt.a.shape[0],
+                                         size_n=bt.w_ref.shape[1],
+                                         size_k=bt.w_ref.shape[0])
+
+    return fn
+
+
+def machete_create_bench_fn(bt: BenchmarkTensors,
+                            out_type=torch.dtype,
+                            schedule=None) -> Callable:
+    w_q = bt.w_q.t().contiguous().t()  # make col major
+    w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype,
+                                None if bt.w_g_s is None else bt.w_g_s.dtype)
+
+    w_g_zp = bt.w_g_zp
+    if w_g_zp is not None:
+        w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype))
+
+    return lambda: ops.machete_mm(
+        a=bt.a,
+        b_q=bt.w_q,
+        b_type=bt.wtype,
+        b_group_scales=bt.w_g_s,
+        b_group_zeros=w_g_zp,
+        b_group_size=bt.group_size,
+        b_channel_scales=bt.w_ch_s,
+        a_token_scales=bt.w_tok_s,
+        out_type=out_type,
+        schedule=schedule,
+    )


 # impl

-
 # bench
-def bench_fn(label: str, sub_label: str, description: str,
-             fn: Callable) -> TMeasurement:

-    min_run_time = 1
-    return TBenchmark.Timer(
-        stmt="fn()",
+
+def bench_fns(label: str, sub_label: str, description: str,
+              fns: List[Callable]):
+
+    min_run_time = 1 if not NVTX_PROFILE else 0.1
+    res = TBenchmark.Timer(
+        stmt="""
+        for fn in fns:
+            fn()
+        """,
        globals={
-            "fn": fn
+            "fns": fns
        },
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=min_run_time)

+    if NVTX_PROFILE:
+        with nvtx.annotate("mm-bench"), nvtx.annotate(
+                f"{label}|{sub_label}|{description}"):
+            fns[0]()

-def loop_over_weights(
-    a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
-                                         torch.tensor, torch.tensor]],
-    fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
-                 None]):
-    for w_ref, w_q, w_s, _ in weights:
-        fn(a, w_ref, w_q, w_s)
+    return res


 _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
 _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None


-def bench(atype: torch.dtype,
-          wtype: ScalarType,
+def bench(types: TypeConfig,
          group_size: int,
          m: int,
          k: int,
          n: int,
          label: str,
          sub_label: str,
-          benchmark_marlinv1: bool = True,
-          sweep_schedules: bool = True) -> Iterable[TMeasurement]:
-    global _SWEEP_SCHEDULES_RESULTS
-
-    a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
-    sub_label += f", L={len(weights)}"
-
-    weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
-                       for w_ref, w_q, w_s, w_zp in weights]
+          sweep_schedules: bool = True) -> List[TMeasurement]:
+    benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
+    sub_label += f", L={len(benchmark_tensors)}"
+
+    name_type_string = f"W{types.weight_type}"+\
+                       f"-A{terse_type_name(types.act_type)}"
+    if types.group_scale_type is not None:
+        name_type_string += f"-GS{terse_type_name(types.group_scale_type)}"
+    if types.group_zero_type is not None:
+        name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}"
+    if group_size is not None:
+        name_type_string += f"-G{group_size}"
+    if types.channel_scale_type is not None:
+        name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}"
+    if types.token_scale_type is not None:
+        name_type_string += f"-TS{terse_type_name(types.token_scale_type)}"

    timers = []
    # pytorch impl
    timers.append(
-        bench_fn(
-            label, sub_label, "torch.matmul", lambda: loop_over_weights(
-                a,
-                weights,
-                lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
-            )))
+        bench_fns(
+            label, sub_label, "torch.matmul (fp16)",
+            [torch_matmul_f16_create_bench_fn(bt)
+             for bt in benchmark_tensors]))

-    if benchmark_marlinv1:
-        w_ref = weights[0][0]
-
-        w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
-        sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
-        g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
-
-        def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
-            w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
-            return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
-                                          wtype.size_bits)
-
-        def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
-            return marlin_permute_scales(w_s, *w_ref.shape, group_size)
-
-        weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
-                             marlinv1_permute_scales(w_s), w_zp)
-                            for w_ref, w_q, w_s, w_zp in weights]
-
-        workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
-                                    GPTQ_MARLIN_MAX_PARALLEL)
-
-        # marlinv1
+    if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn:
+        timers.append(
+            bench_fns(
+                label, sub_label,
+                f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [
+                    cutlass_scaled_mm_create_bench_fn(bt)
+                    for bt in benchmark_tensors
+                ]))
+
+    if types.act_type != torch.float8_e4m3fn:
        timers.append(
-            bench_fn(
-                label, sub_label, "marlin_orig", lambda: loop_over_weights(
-                    a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
-                    gptq_marlin_gemm(a,
-                                     w_q,
-                                     w_s,
-                                     w_zp_empty,
-                                     g_idx,
-                                     sort_indices,
-                                     workspace.scratch,
-                                     wtype,
-                                     size_m=a.shape[0],
-                                     size_n=w_ref.shape[1],
-                                     size_k=w_ref.shape[0],
-                                     is_k_full=True))))
+            bench_fns(label, sub_label, f"marlin ({name_type_string})",
+                      [marlin_create_bench_fn(bt)
+                       for bt in benchmark_tensors]))

    # machete
    timers.append(
-        bench_fn(
-            label, sub_label, "machete_heuristic", lambda: loop_over_weights(
-                a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
-                    a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
+        bench_fns(label, sub_label, f"machete ({name_type_string})", [
+            machete_create_bench_fn(bt, out_type=types.output_type)
+            for bt in benchmark_tensors
+        ]))

    if sweep_schedules:
+        global _SWEEP_SCHEDULES_RESULTS
+
        print("Finding best schedule for machete")
        best = None
        best_schedule = None
-        schedules = ops.machete_supported_schedules(wtype)
+        schedules = ops.machete_supported_schedules(
+            a_type=types.act_type,
+            b_type=types.weight_type,
+            group_scales_type=types.group_scale_type,
+            group_zeros_type=types.group_zero_type,
+            token_scales_type=types.token_scale_type,
+            channel_scales_type=types.channel_scale_type,
+            out_type=types.output_type)
+
+        if schedules is None or len(schedules) == 0:
+            raise ValueError("No schedules found to sweep")
+
        for schedule in reversed(schedules):
            schedule_M = int(schedule.split("_")[0].split("x")[1])

@@ -177,16 +381,11 @@ def bench(atype: torch.dtype,
            if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
                continue

-            def run(a, _, w_q, w_s, schedule=schedule):
-                ops.machete_gemm(a,
-                                 w_q,
-                                 wtype,
-                                 w_s,
-                                 b_group_size=group_size,
-                                 schedule=schedule)
-
-            res = bench_fn(label, sub_label, "machete_best",
-                           lambda: loop_over_weights(a, weights_machete, run))
+            res = bench_fns(label, sub_label, "machete_best", [
+                machete_create_bench_fn(
+                    bt, out_type=types.output_type, schedule=schedule)
+                for bt in benchmark_tensors
+            ])

            results_row = {
                "M": m,
@@ -213,25 +412,33 @@ def bench(atype: torch.dtype,


 # runner
-def print_timers(timers: Iterable[TMeasurement]):
+def print_timers(timers: List[TMeasurement]):
    compare = TBenchmark.Compare(timers)
    compare.print()


-def run(dtype: torch.dtype, sweep_schedules: bool,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    types = TypeConfig(
+        act_type=args.act_type,
+        weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
+            else scalar_types.uint4,
+        output_type=args.out_type,
+        group_scale_type=args.group_scale_type,
+        group_zero_type=args.group_zero_type,
+        channel_scale_type=args.channel_scale_type,
+        token_scale_type=args.token_scale_type,
+    )

-    results = []
+    results: List[TMeasurement] = []
    for m, k, n in MKNs:
-        timers = bench(dtype,
-                       scalar_types.uint4b8,
-                       128,
+        timers = bench(types,
+                       args.group_size,
                       m,
                       k,
                       n,
-                       f"{dtype}-gemm",
+                       f"{args.act_type}-gemm",
                       f"MKN=({m}x{k}x{n})",
-                       sweep_schedules=sweep_schedules)
+                       sweep_schedules=args.sweep_schedules)
        print_timers(timers)
        results.extend(timers)

@@ -240,7 +447,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool,

 # output makers
 def make_output(
-    data: Iterable[TMeasurement],
+    data: List[TMeasurement],
    MKNs: Iterable[Tuple[int, int, int]],
    base_description: str,
    timestamp=None,
@@ -262,17 +469,16 @@ def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-
    data = run(args.dtype, args.sweep_schedules, MKNs)

    make_output(data, MKNs, f"square_bench-{args.dtype}")


 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
    m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
    Ms = list(range(m_start, m_end + 1, m_increment))
    Ks = list(range(k_start, k_end + 1, k_increment))
    Ns = list(range(n_start, n_end + 1, n_increment))
@@ -306,33 +512,49 @@ def run_model_bench(args):
            for k, n in KNs:
                MKNs.append((m, k, n))

-        data = run(args.dtype, args.sweep_schedules, MKNs)
+        data = run(args, MKNs)
        model_bench_data.append(data)

+    type_string = f"{args.act_type}"
+
    # Print all results
    for data, model_tp in zip(model_bench_data, models_tps):
        model, tp_size = model_tp
-        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print(f"== Results {type_string} {model}-TP{tp_size} ====")
        print_timers(data)

-    timestamp = int(time.time())
+    timestr = time.strftime("%Y%m%d-%H%M%S")

-    all_data = []
+    all_results = []
    for d in model_bench_data:
-        all_data.extend(d)
+        all_results.extend(d)
+
    # pickle all data
-    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(all_data, f)
+    with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f:
+        args_dict = vars(args)
+        args_dict.pop("func")
+        pkl.dump({
+            "args": args_dict,
+            "results": all_results,
+        }, f)


 if __name__ == "__main__":

    def to_torch_dtype(dt):
-        if dt == "bfloat16":
-            return torch.bfloat16
-        if dt == "float16":
-            return torch.float16
-        raise ValueError("unsupported dtype")
+        return {
+            "bfloat16": torch.bfloat16,
+            "float16": torch.float16,
+            "int8": torch.int8,
+            "float8_e4m3fn": torch.float8_e4m3fn,
+            "int": torch.int,
+            "float": torch.float,
+        }[dt]
+
+    class ToTorchDtype(argparse.Action):
+
+        def __call__(self, parser, namespace, values, option_string=None):
+            setattr(namespace, self.dest, to_torch_dtype(values))

    parser = FlexibleArgumentParser(
        description="""
@@ -352,12 +574,42 @@ Benchmark Machete GEMM.
            """,  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )
-
    parser.add_argument(
-        "--dtype",
-        type=to_torch_dtype,
+        "--act-type",
+        action=ToTorchDtype,
        required=True,
-        help="Available options are ['bfloat16', 'float16']",
+        choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'],
+    )
+    parser.add_argument(
+        "--group-scale-type",
+        action=ToTorchDtype,
+        choices=['bfloat16', 'float16'],
+    )
+    parser.add_argument(
+        "--group-zero-type",
+        type=to_torch_dtype,
+        choices=['bfloat16', 'float16'],
+    )
+    parser.add_argument(
+        "--channel-scale-type",
+        action=ToTorchDtype,
+        choices=['float'],
+    )
+    parser.add_argument(
+        "--token-scale-type",
+        action=ToTorchDtype,
+        choices=['float'],
+    )
+    parser.add_argument(
+        "--out-type",
+        action=ToTorchDtype,
+        choices=['bfloat16', 'float16'],
+    )
+    parser.add_argument(
+        "--group-size",
+        type=int,
+        help="Available options are ['None', '-1', '128'], default=128",
+        default=128,
    )
    parser.add_argument(
        "--sweep-schedules",

--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
    results.append(
        benchmark.Timer(
            stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)",  # noqa: E501
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
@@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str,
    results.append(
        benchmark.Timer(
            stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)",  # noqa: E501
+            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -10,7 +10,8 @@ from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser


 class BenchmarkConfig(TypedDict):
@@ -88,22 +89,23 @@ def benchmark_config(
        input_gating.copy_(gating_output[i])

    def run():
-        fused_moe(
-            x,
-            w1,
-            w2,
-            input_gating,
-            topk,
-            renormalize=True,
-            inplace=True,
-            override_config=config,
-            use_fp8_w8a8=use_fp8_w8a8,
-            use_int8_w8a16=use_int8_w8a16,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-        )
+        from vllm.model_executor.layers.fused_moe import override_config
+        with override_config(config):
+            fused_moe(
+                x,
+                w1,
+                w2,
+                input_gating,
+                topk,
+                renormalize=True,
+                inplace=True,
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+            )

    # JIT compilation & warmup
    run()
@@ -166,7 +168,7 @@ class BenchmarkWorker:

    def __init__(self, seed: int) -> None:
        torch.set_default_device("cuda")
-        seed_everything(seed)
+        current_platform.seed_everything(seed)
        self.seed = seed

    def benchmark(
@@ -180,7 +182,7 @@ class BenchmarkWorker:
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
    ) -> Tuple[Dict[str, int], float]:
-        seed_everything(self.seed)
+        current_platform.seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
                                         use_int8_w8a16=use_int8_w8a16,
                                         use_fp8_w8a8=use_fp8_w8a8)

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -5,10 +5,12 @@ from typing import List, Optional
 import torch

 from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random, seed_everything)
+                        create_kv_caches_with_random)
 import vllm.envs as envs

+
 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512

@@ -29,7 +31,7 @@ def main(
    device: str = "cuda",
    kv_cache_dtype: Optional[str] = None,
 ) -> None:
-    seed_everything(seed)
+    current_platform.seed_everything(seed)

    scale = float(1.0 / (head_size**0.5))
    query = torch.empty(num_seqs,

--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -3,8 +3,8 @@ import time
 import torch

 from vllm import _custom_ops as ops
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser


 @torch.inference_mode()
@@ -17,7 +17,7 @@ def main(num_tokens: int,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
-    seed_everything(seed)
+    current_platform.seed_everything(seed)
    torch.set_default_device("cuda")

    x = torch.randn(num_tokens, hidden_size, dtype=dtype)

--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
+import itertools
+from typing import Optional, Tuple, Union
+
+import torch
+import triton
+from flashinfer.norm import fused_add_rmsnorm, rmsnorm
+from torch import nn
+
+from vllm import _custom_ops as vllm_ops
+
+
+class HuggingFaceRMSNorm(nn.Module):
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x.to(orig_dtype) * self.weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+
+def rmsnorm_naive(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
+    naive_norm.weight = nn.Parameter(weight)
+    naive_norm = naive_norm.to(x.device)
+
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    output = naive_norm(x, residual)
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def rmsnorm_flashinfer(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    if residual is not None:
+        fused_add_rmsnorm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        output = rmsnorm(x, weight, eps)
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def rmsnorm_vllm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    residual: Optional[torch.Tensor] = None,
+    eps: float = 1e-6,
+):
+    orig_shape = x.shape
+    x = x.view(-1, x.shape[-1])
+    if residual is not None:
+        residual = residual.view(-1, residual.shape[-1])
+
+    if residual is not None:
+        vllm_ops.fused_add_rms_norm(x, residual, weight, eps)
+        output = (x, residual)
+    else:
+        out = torch.empty_like(x)
+        vllm_ops.rms_norm(out, x, weight, eps)
+        output = out
+
+    if isinstance(output, tuple):
+        output = (output[0].view(orig_shape), output[1].view(orig_shape))
+    else:
+        output = output.view(orig_shape)
+    return output
+
+
+def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
+    dtype = torch.bfloat16
+    x = torch.randn(batch_size,
+                    seq_len,
+                    hidden_size,
+                    dtype=dtype,
+                    device="cuda")
+    weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+    residual = torch.randn_like(x) if use_residual else None
+
+    output_naive = rmsnorm_naive(
+        x.clone(), weight,
+        residual.clone() if residual is not None else None)
+    output_flashinfer = rmsnorm_flashinfer(
+        x.clone(), weight,
+        residual.clone() if residual is not None else None)
+    output_vllm = rmsnorm_vllm(
+        x.clone(), weight,
+        residual.clone() if residual is not None else None)
+
+    if use_residual:
+        output_naive = output_naive[0]
+        output_flashinfer = output_flashinfer[0]
+        output_vllm = output_vllm[0]
+
+    print(f"Naive output={output_naive}")
+    print(f"FlashInfer output={output_flashinfer}")
+    print(f"VLLM output={output_vllm}")
+
+    if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
+                      rtol=1e-2) and torch.allclose(
+                          output_naive, output_vllm, atol=1e-2, rtol=1e-2):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [2**i for i in range(0, 7, 2)]
+seq_length_range = [2**i for i in range(6, 11, 1)]
+head_num_range = [32, 48]
+configs = list(
+    itertools.product(head_num_range, batch_size_range, seq_length_range))
+
+
+def get_benchmark(use_residual):
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["head_num", "batch_size", "seq_len"],
+            x_vals=[list(_) for _ in configs],
+            line_arg="provider",
+            line_vals=["huggingface", "flashinfer", "vllm"],
+            line_names=["HuggingFace", "FlashInfer", "vLLM"],
+            styles=[("blue", "-"), ("green", "-"), ("red", "-")],
+            ylabel="us",
+            plot_name=
+            f"rmsnorm-perf-{'with' if use_residual else 'without'}-residual",
+            args={},
+        ))
+    def benchmark(head_num, batch_size, seq_len, provider):
+        dtype = torch.bfloat16
+        hidden_size = head_num * 128  # assuming head_dim = 128
+
+        x = torch.randn(batch_size,
+                        seq_len,
+                        hidden_size,
+                        dtype=dtype,
+                        device="cuda")
+        weight = torch.ones(hidden_size, dtype=dtype, device="cuda")
+        residual = torch.randn_like(x) if use_residual else None
+
+        quantiles = [0.5, 0.2, 0.8]
+
+        if provider == "huggingface":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_naive(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        elif provider == "flashinfer":
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_flashinfer(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+        else:
+            ms, min_ms, max_ms = triton.testing.do_bench(
+                lambda: rmsnorm_vllm(
+                    x.clone(),
+                    weight,
+                    residual.clone() if residual is not None else None,
+                ),
+                quantiles=quantiles,
+            )
+
+        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+    return benchmark
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=4,
+        help="Batch size",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=128,
+        help="Sequence length",
+    )
+    parser.add_argument(
+        "--hidden-size",
+        type=int,
+        default=4096,
+        help="Hidden size (2nd dimension) of the sequence",
+    )
+    parser.add_argument("--use-residual",
+                        action="store_true",
+                        help="Whether to use residual connection")
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default="./configs/rmsnorm/",
+        help="Path to save rmsnorm benchmark results",
+    )
+
+    args = parser.parse_args()
+
+    # Run correctness test
+    calculate_diff(batch_size=args.batch_size,
+                   seq_len=args.seq_len,
+                   hidden_size=args.hidden_size,
+                   use_residual=args.use_residual)
+
+    # Get the benchmark function with proper use_residual setting
+    benchmark = get_benchmark(args.use_residual)
+    # Run performance benchmark
+    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -6,7 +6,8 @@ import torch

 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                         get_rope)
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser


 def benchmark_rope_kernels_multi_lora(
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
    max_position: int = 8192,
    base: int = 10000,
 ) -> None:
-    seed_everything(seed)
+    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size
@@ -31,7 +32,7 @@ def benchmark_rope_kernels_multi_lora(
    # batched RoPE can take multiple scaling factors
    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
                            is_neox_style, {
-                                "type": "linear",
+                                "rope_type": "linear",
                                "factor": tuple(scaling_factors)
                            })
    # non-batched RoPE takes only one scaling factor, we create multiple
@@ -41,7 +42,7 @@ def benchmark_rope_kernels_multi_lora(
        non_batched_ropes.append(
            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                     {
-                         "type": "linear",
+                         "rope_type": "linear",
                         "factor": (scaling_factor, )
                     }))


--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -20,10 +20,11 @@ if __name__ == "__main__":
    args = parser.parse_args()

    with open(args.filename, 'rb') as f:
-        data: List[TMeasurement] = pickle.load(f)
+        data = pickle.load(f)
+        raw_results: List[TMeasurement] = data["results"]

    results = defaultdict(lambda: list())
-    for v in data:
+    for v in raw_results:
        result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label)
        if result is not None:
            KN = result.group(1)

--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@@ -40,4 +40,10 @@ WEIGHT_SHAPES = {
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
+    "meta-llama/Llama-3.1-405b-hf": [
+        ([16384, 18432], 1),
+        ([16384, 16384], 0),
+        ([16384, 106496], 1),
+        ([53248, 16384], 0),
+    ],
 }
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,13 +4,13 @@ PORT=8000
 MODEL=$1
 TOKENS=$2

-docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
-           -v $PWD/data:/data \
+docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
+           -v "$PWD/data:/data" \
           ghcr.io/huggingface/text-generation-inference:2.2.0 \
-           --model-id $MODEL \
+           --model-id "$MODEL" \
           --sharded false  \
           --max-input-length 1024 \
           --max-total-tokens 2048 \
           --max-best-of 5 \
           --max-concurrent-requests 5000 \
-           --max-batch-total-tokens $TOKENS
+           --max-batch-total-tokens "$TOKENS"
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -16,7 +16,6 @@ def main(args):
        enforce_eager=True,
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
-        use_v2_block_manager=args.use_v2_block_manager,
    )

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
@@ -56,8 +55,5 @@ if __name__ == "__main__":
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='Use BlockSpaceMangerV2')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/structured_schemas/structured_schema_1.json
+++ b/benchmarks/structured_schemas/structured_schema_1.json
+{
+    "$schema":
+    "https://json-schema.org/draft/2020-12/schema",
+    "title":
+    "User Profile",
+    "type":
+    "object",
+    "properties": {
+        "userId": {
+            "type": "string",
+            "description": "Unique identifier for the user."
+        },
+        "personalInfo": {
+            "type": "object",
+            "properties": {
+                "firstName": {
+                    "type": "string",
+                    "description": "The user's first name."
+                },
+                "lastName": {
+                    "type": "string",
+                    "description": "The user's last name."
+                },
+                "age": {
+                    "type": "integer",
+                    "minimum": 0,
+                    "description": "The user's age."
+                },
+                "phoneNumbers": {
+                    "type":
+                    "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "type": {
+                                "type": "string",
+                                "enum": ["home", "work", "mobile"],
+                                "description": "Type of phone number."
+                            },
+                            "number": {
+                                "type": "string",
+                                "pattern": "^\\+?[1-9]\\d{1,14}$",
+                                "description": "Phone number in E.164 format."
+                            }
+                        },
+                        "required": ["type", "number"]
+                    },
+                    "description":
+                    "List of phone numbers associated with the user."
+                }
+            },
+            "required": ["firstName", "lastName"]
+        },
+        "address": {
+            "type": "object",
+            "properties": {
+                "street": {
+                    "type": "string",
+                    "description": "Street address."
+                },
+                "city": {
+                    "type": "string",
+                    "description": "City name."
+                },
+                "state": {
+                    "type": "string",
+                    "description": "State or province."
+                },
+                "postalCode": {
+                    "type": "string",
+                    "pattern": "^\\d{5}(-\\d{4})?$",
+                    "description": "Postal code."
+                },
+                "country": {
+                    "type": "string",
+                    "description": "Country name."
+                }
+            },
+            "required": ["street", "city", "state", "postalCode", "country"]
+        },
+        "preferences": {
+            "type": "object",
+            "properties": {
+                "newsletterSubscribed": {
+                    "type":
+                    "boolean",
+                    "description":
+                    "Indicates if the user is subscribed to the newsletter."
+                },
+                "favoriteCategories": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "List of user's favorite categories."
+                }
+            },
+            "required": ["newsletterSubscribed"]
+        },
+        "accountStatus": {
+            "type": "string",
+            "enum": ["active", "inactive", "suspended"],
+            "description": "Current status of the user's account."
+        },
+        "registrationDate": {
+            "type": "string",
+            "format": "date-time",
+            "description": "ISO 8601 formatted date-time of user registration."
+        }
+    },
+    "required":
+    ["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
+}
\ No newline at end of file
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
+include(FetchContent)
+
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_CXX_STANDARD 17)

 #
 # Define environment variables for special configurations
@@ -13,6 +16,12 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
+
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    list(APPEND CXX_COMPILE_FLAGS
+        "-mf16c"
+    )
+endif()
 list(APPEND CXX_COMPILE_FLAGS
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")
@@ -49,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
 find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
 find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    list(APPEND CXX_COMPILE_FLAGS
@@ -68,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
    else()
        message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
    endif()
+    
 elseif (AVX2_FOUND)
    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
    message(WARNING "vLLM CPU backend using AVX2 ISA")
+    
 elseif (POWER9_FOUND OR POWER10_FOUND)
    message(STATUS "PowerPC detected")
    # Check for PowerPC VSX support
@@ -78,13 +91,55 @@ elseif (POWER9_FOUND OR POWER10_FOUND)
        "-mvsx"
        "-mcpu=native"
        "-mtune=native")
+
+elseif (ASIMD_FOUND)
+    message(STATUS "ARMv8 or later architecture detected")
+    if(ARM_BF16_FOUND)
+        message(STATUS "BF16 extension detected")
+        set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
+        add_compile_definitions(ARM_BF16_SUPPORT)
+    else()
+        message(WARNING "BF16 functionality is not available")
+        set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
+    endif()
+    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
+endif()
+
+#
+# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
+#
+if (AVX512_FOUND AND NOT AVX512_DISABLED)
+    FetchContent_Declare(
+        oneDNN
+        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+        GIT_TAG  v3.6
+        GIT_PROGRESS TRUE
+        GIT_SHALLOW TRUE
+    )
+
+    set(ONEDNN_LIBRARY_TYPE "STATIC")
+    set(ONEDNN_BUILD_DOC "OFF")
+    set(ONEDNN_BUILD_EXAMPLES "OFF")
+    set(ONEDNN_BUILD_TESTS "OFF")
+    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
+    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
+    set(ONEDNN_BUILD_GRAPH "OFF")
+    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
+    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
+    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
+    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
+
+    FetchContent_MakeAvailable(oneDNN)
+    
+    list(APPEND LIBS dnnl)
 endif()

 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

-list(APPEND LIBS dnnl numa)
+list(APPEND LIBS numa)

 #
 # _C extension
@@ -119,4 +174,4 @@ define_gpu_extension_target(
    WITH_SOABI
 )

-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -138,10 +138,181 @@ macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
 endmacro()

+#
+# Clear all `-gencode` flags from `CMAKE_CUDA_FLAGS` and store them in
+# `CUDA_ARCH_FLAGS`.
+#
+# Example:
+#   CMAKE_CUDA_FLAGS="-Wall -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
+#   clear_cuda_arches(CUDA_ARCH_FLAGS)
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_70,code=sm_70;-gencode arch=compute_75,code=sm_75"
+#   CMAKE_CUDA_FLAGS="-Wall"
+#
+macro(clear_cuda_arches CUDA_ARCH_FLAGS)
+    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
+    string(REGEX MATCHALL "-gencode arch=[^ ]+" CUDA_ARCH_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+
+    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
+    # and passed back via the `CUDA_ARCHITECTURES` property.
+    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+endmacro()
+
+#
+# Extract unique CUDA architectures from a list of compute capabilities codes in 
+# the form `<major><minor>[<letter>]`, convert them to the form sort 
+# `<major>.<minor>`, dedupes them and then sorts them in ascending order and 
+# stores them in `OUT_ARCHES`.
+#
+# Example:
+#   CUDA_ARCH_FLAGS="-gencode arch=compute_75,code=sm_75;...;-gencode arch=compute_90a,code=sm_90a" 
+#   extract_unique_cuda_archs_ascending(OUT_ARCHES CUDA_ARCH_FLAGS)
+#   OUT_ARCHES="7.5;...;9.0"
+function(extract_unique_cuda_archs_ascending OUT_ARCHES CUDA_ARCH_FLAGS)
+  set(_CUDA_ARCHES)
+  foreach(_ARCH ${CUDA_ARCH_FLAGS})
+    string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
+    if (_COMPUTE)
+      set(_COMPUTE ${CMAKE_MATCH_1})
+    endif()
+
+    string_to_ver(_COMPUTE_VER ${_COMPUTE})
+    list(APPEND _CUDA_ARCHES ${_COMPUTE_VER})
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHES)
+  list(SORT _CUDA_ARCHES COMPARE NATURAL ORDER ASCENDING)
+  set(${OUT_ARCHES} ${_CUDA_ARCHES} PARENT_SCOPE)
+endfunction()
+
+#
+# For a specific file set the `-gencode` flag in compile options conditionally 
+# for the CUDA language. 
+#
+# Example:
+#   set_gencode_flag_for_srcs(
+#     SRCS "foo.cu"
+#     ARCH "compute_75"
+#     CODE "sm_75")
+#   adds: "-gencode arch=compute_75,code=sm_75" to the compile options for 
+#    `foo.cu` (only for the CUDA language).
+#
+macro(set_gencode_flag_for_srcs)
+  set(options)
+  set(oneValueArgs ARCH CODE)
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+  set(_FLAG -gencode arch=${arg_ARCH},code=${arg_CODE})
+  set_property(
+    SOURCE ${arg_SRCS}
+    APPEND PROPERTY
+    COMPILE_OPTIONS "$<$<COMPILE_LANGUAGE:CUDA>:${_FLAG}>"
+  )
+
+  message(DEBUG "Setting gencode flag for ${arg_SRCS}: ${_FLAG}")
+endmacro(set_gencode_flag_for_srcs)
+
+#
+# For a list of source files set the `-gencode` flags in the files specific 
+#  compile options (specifically for the CUDA language).
+#
+# arguments are:
+#  SRCS: list of source files
+#  CUDA_ARCHS: list of CUDA architectures in the form `<major>.<minor>[letter]`
+#  BUILD_PTX_FOR_ARCH: if set to true, then the PTX code will be built
+#    for architecture `BUILD_PTX_FOR_ARCH` if there is a CUDA_ARCH in CUDA_ARCHS 
+#    that is larger than BUILD_PTX_FOR_ARCH.
+#
+macro(set_gencode_flags_for_srcs)
+  set(options)
+  set(oneValueArgs BUILD_PTX_FOR_ARCH)
+  set(multiValueArgs SRCS CUDA_ARCHS)
+  cmake_parse_arguments(arg "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  foreach(_ARCH ${arg_CUDA_ARCHS})
+    string(REPLACE "." "" _ARCH "${_ARCH}")
+    set_gencode_flag_for_srcs(
+      SRCS ${arg_SRCS}
+      ARCH "compute_${_ARCH}"
+      CODE "sm_${_ARCH}")
+  endforeach()
+
+  if (${arg_BUILD_PTX_FOR_ARCH})
+    list(SORT arg_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+    list(GET arg_CUDA_ARCHS -1 _HIGHEST_ARCH)
+    if (_HIGHEST_ARCH VERSION_GREATER_EQUAL ${arg_BUILD_PTX_FOR_ARCH})
+      string(REPLACE "." "" _PTX_ARCH "${arg_BUILD_PTX_FOR_ARCH}")
+      set_gencode_flag_for_srcs(
+        SRCS ${arg_SRCS}
+        ARCH "compute_${_PTX_ARCH}"
+        CODE "compute_${_PTX_ARCH}")
+    endif()
+  endif()
+endmacro()
+
+#
+# For the given `SRC_CUDA_ARCHS` list of gencode versions in the form 
+#  `<major>.<minor>[letter]` compute the "loose intersection" with the 
+#  `TGT_CUDA_ARCHS` list of gencodes. 
+# The loose intersection is defined as:
+#   { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} }
+#  where `<=` is the version comparison operator.
+# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
+#  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
+# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
+#  in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
+#  9.0a to the result. 
+# The result is stored in `OUT_CUDA_ARCHS`.
+#
+# Example:
+#   SRC_CUDA_ARCHS="7.5;8.0;8.6;9.0;9.0a"
+#   TGT_CUDA_ARCHS="8.0;8.9;9.0"
+#   cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+#   OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a"
+#
+function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
+  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+
+  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
+  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
+  set(_CUDA_ARCHS)
+  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
+    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+      set(_CUDA_ARCHS "9.0a")
+    endif()
+  endif()
+
+  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
+
+  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is 
+  # less or eqault to ARCH
+  foreach(_ARCH ${CUDA_ARCHS})
+  set(_TMP_ARCH)
+  foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+    if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+      set(_TMP_ARCH ${_SRC_ARCH})
+    else()
+      break()
+    endif()
+  endforeach()
+  if (_TMP_ARCH)
+    list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
+  endif()
+  endforeach()
+
+  list(REMOVE_DUPLICATES _CUDA_ARCHS)
+  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)
+endfunction()
+
 #
 # Override the GPU architectures detected by cmake/torch and filter them by
 # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
-# `GPU_ARCHES`.
+# `GPU_ARCHES`. This only applies to the HIP language since for CUDA we set 
+# the architectures on a per file basis.
 #
 # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
 #
@@ -179,109 +350,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()
-
-  elseif(${GPU_LANG} STREQUAL "CUDA")
-    #
-    # Setup/process CUDA arch flags.
-    #
-    # The torch cmake setup hardcodes the detected architecture flags in
-    # `CMAKE_CUDA_FLAGS`.  Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
-    # can't modified on a per-target basis.
-    # So, all the `-gencode` flags need to be extracted and removed from
-    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
-    # Since it's not possible to use `target_compiler_options` for adding target
-    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
-    # must be used instead.  This requires repackaging the architecture flags
-    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
-    #
-    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
-    # to one of the other nvcc options to specify architectures.
-    #
-    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
-    # detected architectures.
-    #
-    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
-
-    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
-    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
-    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
-    # and passed back via the `CUDA_ARCHITECTURES` property.
-    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
-      ${CMAKE_CUDA_FLAGS})
-
-    # If this error is triggered, it might mean that torch has changed how it sets
-    # up nvcc architecture code generation flags.
-    if (NOT _CUDA_ARCH_FLAGS)
-      message(FATAL_ERROR
-        "Could not find any architecture related code generation flags in "
-        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
-    endif()
-
-    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
-    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
-
-    # Initialize the architecture lists to empty.
-    set(${GPU_ARCHES})
-
-    # Process each `gencode` flag.
-    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
-      # For each flag, extract the version number and whether it refers to PTX
-      # or native code.
-      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
-      # for that match.
-
-      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
-      if (_COMPUTE)
-        set(_COMPUTE ${CMAKE_MATCH_1})
-      endif()
-
-      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
-      if (_SM)
-        set(_SM ${CMAKE_MATCH_1})
-      endif()
-
-      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
-      if (_CODE)
-        set(_CODE ${CMAKE_MATCH_1})
-      endif()
-
-      # Make sure the virtual architecture can be matched.
-      if (NOT _COMPUTE)
-        message(FATAL_ERROR
-          "Could not determine virtual architecture from: ${_ARCH}.")
-      endif()
-
-      # One of sm_ or compute_ must exist.
-      if ((NOT _SM) AND (NOT _CODE))
-        message(FATAL_ERROR
-          "Could not determine a codegen architecture from: ${_ARCH}.")
-      endif()
-
-      if (_SM)
-        # -real suffix let CMake to only generate elf code for the kernels.
-        # we want this, otherwise the added ptx (default) will increase binary size.
-        set(_VIRT "-real")
-        set(_CODE_ARCH ${_SM})
-      else()
-        # -virtual suffix let CMake to generate ptx code for the kernels.
-        set(_VIRT "-virtual")
-        set(_CODE_ARCH ${_CODE})
-      endif()
-
-      # Check if the current version is in the supported arch list.
-      string_to_ver(_CODE_VER ${_CODE_ARCH})
-      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
-        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
-        continue()
-      endif()
-
-      # Add it to the arch list.
-      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
-    endforeach()
  endif()
-  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
 endmacro()

 #
@@ -360,11 +429,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
-    if ("${CUDA_CUDA_LIB}" STREQUAL "")
-      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
-    endif()
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
-      ${CUDA_LIBRARIES})
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

--- a/collect_env.py
+++ b/collect_env.py
 # ruff: noqa
 # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

-# Unlike the rest of the PyTorch this file must be python2 compliant.
-# This script outputs relevant system environment info
-# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
 import datetime
 import locale
 import os
 import re
 import subprocess
 import sys
+# Unlike the rest of the PyTorch this file must be python2 compliant.
+# This script outputs relevant system environment info
+# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
 from collections import namedtuple

+from vllm.envs import environment_variables
+
 try:
    import torch
    TORCH_AVAILABLE = True
@@ -52,6 +54,7 @@ SystemEnv = namedtuple(
        'vllm_version',  # vllm specific field
        'vllm_build_flags',  # vllm specific field
        'gpu_topo',  # vllm specific field
+        'env_vars',
    ])

 DEFAULT_CONDA_PATTERNS = {
@@ -267,13 +270,16 @@ def get_neuron_sdk_version(run_lambda):


 def get_vllm_version():
-    try:
-        import vllm
-        return vllm.__version__ + "@" + vllm.__commit__
-    except Exception:
-        # old version of vllm does not have __commit__
-        return 'N/A'
+    from vllm import __version__, __version_tuple__

+    if __version__ == "dev":
+        return "N/A (dev)"
+
+    if len(__version_tuple__) == 4: # dev build
+        git_sha = __version_tuple__[-1][1:] # type: ignore
+        return f"{__version__} (git sha: {git_sha}"
+
+    return __version__

 def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
@@ -509,6 +515,22 @@ def is_xnnpack_available():
    else:
        return "N/A"

+def get_env_vars():
+    env_vars = ''
+    secret_terms=('secret', 'token', 'api', 'access', 'password')
+    report_prefix = ("TORCH", "NCCL", "PYTORCH",
+                     "CUDA", "CUBLAS", "CUDNN",
+                     "OMP_", "MKL_",
+                     "NVIDIA")
+    for k, v in os.environ.items():
+        if any(term in k.lower() for term in secret_terms):
+            continue
+        if k in environment_variables:
+            env_vars = env_vars + "{}={}".format(k, v) + "\n"
+        if k.startswith(report_prefix):
+            env_vars = env_vars + "{}={}".format(k, v) + "\n"
+
+    return env_vars

 def get_env_info():
    run_lambda = run
@@ -580,6 +602,7 @@ def get_env_info():
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
+        env_vars=get_env_vars(),
    )


@@ -628,6 +651,8 @@ vLLM Build Flags:
 {vllm_build_flags}
 GPU Topology:
 {gpu_topo}
+
+{env_vars}
 """.strip()



--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -89,6 +89,48 @@ void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]

 namespace vllm {

+template <typename T>
+__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) {
+  const float f = (float)x;
+  return (T)(f > threshold ? f : 0.0f);
+}
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
+__global__ void act_and_mul_kernel_with_param(
+    scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
+    const float param) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
+    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
+    out[token_idx * d + idx] = ACT_FN(x, param) * y;
+  }
+}
+
+}  // namespace vllm
+
+#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM)         \
+  int d = input.size(-1) / 2;                                           \
+  int64_t num_tokens = input.numel() / input.size(-1);                  \
+  dim3 grid(num_tokens);                                                \
+  dim3 block(std::min(d, 1024));                                        \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));     \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();         \
+  VLLM_DISPATCH_FLOATING_TYPES(                                         \
+      input.scalar_type(), "act_and_mul_kernel_with_param", [&] {       \
+        vllm::act_and_mul_kernel_with_param<scalar_t, KERNEL<scalar_t>> \
+            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),      \
+                                         input.data_ptr<scalar_t>(), d, \
+                                         PARAM);                        \
+      });
+
+void fatrelu_and_mul(torch::Tensor& out,    // [..., d],
+                     torch::Tensor& input,  // [..., 2 * d]
+                     double threshold) {
+  LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
+}
+namespace vllm {
+
 // Element-wise activation kernel template.
 template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
 __global__ void activation_kernel(

--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -300,7 +300,7 @@ __device__ void paged_attention_kernel(
        // Store the partial reductions to shared memory.
        // NOTE(woosuk): It is required to zero out the masked logits.
        const bool mask = token_idx >= seq_len;
-        logits[token_idx - start_token_idx] = mask ? 0.f: qk;
+        logits[token_idx - start_token_idx] = mask ? 0.f : qk;
        // Update the max value.
        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
      }
@@ -670,338 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel(

 }  // namespace vllm

-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                \
-  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                     \
-      ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE,        \
-                                              BLOCK_SIZE, NUM_THREADS,      \
-                                              KV_DTYPE, IS_BLOCK_SPARSE>),  \
-      shared_mem_size);                                                     \
-  vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,        \
-                                  NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE>   \
-      <<<grid, block, shared_mem_size, stream>>>(                           \
-          out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
-          scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,    \
-          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,      \
-          k_scale, v_scale, tp_rank, blocksparse_local_blocks,              \
-          blocksparse_vert_stride, blocksparse_block_size,                  \
-          blocksparse_head_sliding_step);
-
-// TODO(woosuk): Tune NUM_THREADS.
-template <typename T, typename CACHE_T, int BLOCK_SIZE,
-          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
-          int NUM_THREADS = 128>
-void paged_attention_v1_launcher(
-    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
-    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-
-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
-  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int padded_max_seq_len =
-      DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
-  int logits_size = padded_max_seq_len * sizeof(float);
-  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
-  // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
-  // Keep that in sync with the logic here!
-  int shared_mem_size = std::max(logits_size, outputs_size);
-
-  dim3 grid(num_heads, num_seqs, 1);
-  dim3 block(NUM_THREADS);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  switch (head_size) {
-    // NOTE(woosuk): To reduce the compilation time, we only compile for the
-    // head sizes that we use in the model. However, we can easily extend this
-    // to support any head size which is a multiple of 16.
-    case 64:
-      LAUNCH_PAGED_ATTENTION_V1(64);
-      break;
-    case 80:
-      LAUNCH_PAGED_ATTENTION_V1(80);
-      break;
-    case 96:
-      LAUNCH_PAGED_ATTENTION_V1(96);
-      break;
-    case 112:
-      LAUNCH_PAGED_ATTENTION_V1(112);
-      break;
-    case 120:
-      LAUNCH_PAGED_ATTENTION_V1(120);
-      break;
-    case 128:
-      LAUNCH_PAGED_ATTENTION_V1(128);
-      break;
-    case 160:
-      LAUNCH_PAGED_ATTENTION_V1(160);
-      break;
-    case 192:
-      LAUNCH_PAGED_ATTENTION_V1(192);
-      break;
-    case 256:
-      LAUNCH_PAGED_ATTENTION_V1(256);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)  \
-  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,              \
-                              IS_BLOCK_SPARSE>(                              \
-      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
-      seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank,        \
-      blocksparse_local_blocks, blocksparse_vert_stride,                     \
-      blocksparse_block_size, blocksparse_head_sliding_step);
-
-#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  switch (is_block_sparse) {                                               \
-    case true:                                                             \
-      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);     \
-      break;                                                               \
-    case false:                                                            \
-      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);    \
-      break;                                                               \
-  }
-
-// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
-  switch (block_size) {                                           \
-    case 8:                                                       \
-      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
-      break;                                                      \
-    case 16:                                                      \
-      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
-      break;                                                      \
-    case 32:                                                      \
-      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
-  }
-
-void paged_attention_v1(
-    torch::Tensor& out,    // [num_seqs, num_heads, head_size]
-    torch::Tensor& query,  // [num_seqs, num_heads, head_size]
-    torch::Tensor&
-        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
-    torch::Tensor&
-        value_cache,       // [num_blocks, num_heads, head_size, block_size]
-    int64_t num_kv_heads,  // [num_heads]
-    double scale,
-    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
-    torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
-  const bool is_block_sparse = (blocksparse_vert_stride > 1);
-
-  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
-                             CALL_V1_LAUNCHER_BLOCK_SIZE)
-}
-
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                   \
-  vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,           \
-                                  NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE,      \
-                                  PARTITION_SIZE>                              \
-      <<<grid, block, shared_mem_size, stream>>>(                              \
-          exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
-          value_cache_ptr, num_kv_heads, scale, block_tables_ptr,              \
-          seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
-          kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank,          \
-          blocksparse_local_blocks, blocksparse_vert_stride,                   \
-          blocksparse_block_size, blocksparse_head_sliding_step);              \
-  vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS,            \
-                                         PARTITION_SIZE>                       \
-      <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(                \
-          out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr,    \
-          max_num_partitions);
-
-template <typename T, typename CACHE_T, int BLOCK_SIZE,
-          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
-          int NUM_THREADS = 128, int PARTITION_SIZE = 512>
-void paged_attention_v2_launcher(
-    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
-    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
-    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {
-  int num_seqs = query.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
-
-  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
-  assert(head_size % thread_group_size == 0);
-
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
-
-  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
-  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
-  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
-  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
-  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* seq_lens_ptr = seq_lens.data_ptr<int>();
-
-  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
-  int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
-  int logits_size = PARTITION_SIZE * sizeof(float);
-  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
-
-  // For paged attention v2 kernel.
-  dim3 grid(num_heads, num_seqs, max_num_partitions);
-  int shared_mem_size = std::max(logits_size, outputs_size);
-  // For paged attention v2 reduce kernel.
-  dim3 reduce_grid(num_heads, num_seqs);
-  int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
-
-  dim3 block(NUM_THREADS);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  switch (head_size) {
-    // NOTE(woosuk): To reduce the compilation time, we only compile for the
-    // head sizes that we use in the model. However, we can easily extend this
-    // to support any head size which is a multiple of 16.
-    case 64:
-      LAUNCH_PAGED_ATTENTION_V2(64);
-      break;
-    case 80:
-      LAUNCH_PAGED_ATTENTION_V2(80);
-      break;
-    case 96:
-      LAUNCH_PAGED_ATTENTION_V2(96);
-      break;
-    case 112:
-      LAUNCH_PAGED_ATTENTION_V2(112);
-      break;
-    case 120:
-      LAUNCH_PAGED_ATTENTION_V2(120);
-      break;
-    case 128:
-      LAUNCH_PAGED_ATTENTION_V2(128);
-      break;
-    case 160:
-      LAUNCH_PAGED_ATTENTION_V2(160);
-      break;
-    case 192:
-      LAUNCH_PAGED_ATTENTION_V2(192);
-      break;
-    case 256:
-      LAUNCH_PAGED_ATTENTION_V2(256);
-      break;
-    default:
-      TORCH_CHECK(false, "Unsupported head size: ", head_size);
-      break;
-  }
-}
-
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)   \
-  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,               \
-                              IS_BLOCK_SPARSE>(                               \
-      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,      \
-      num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
-      k_scale, v_scale, tp_rank, blocksparse_local_blocks,                    \
-      blocksparse_vert_stride, blocksparse_block_size,                        \
-      blocksparse_head_sliding_step);
-
-#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  switch (is_block_sparse) {                                               \
-    case true:                                                             \
-      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);     \
-      break;                                                               \
-    case false:                                                            \
-      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);    \
-      break;                                                               \
-  }
-
-// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
-// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
-  switch (block_size) {                                           \
-    case 8:                                                       \
-      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
-      break;                                                      \
-    case 16:                                                      \
-      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
-      break;                                                      \
-    case 32:                                                      \
-      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
-      break;                                                      \
-    default:                                                      \
-      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
-      break;                                                      \
-  }
-
-void paged_attention_v2(
-    torch::Tensor& out,         // [num_seqs, num_heads, head_size]
-    torch::Tensor& exp_sums,    // [num_seqs, num_heads, max_num_partitions]
-    torch::Tensor& max_logits,  // [num_seqs, num_heads, max_num_partitions]
-    torch::Tensor&
-        tmp_out,  // [num_seqs, num_heads, max_num_partitions, head_size]
-    torch::Tensor& query,  // [num_seqs, num_heads, head_size]
-    torch::Tensor&
-        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
-    torch::Tensor&
-        value_cache,       // [num_blocks, num_heads, head_size, block_size]
-    int64_t num_kv_heads,  // [num_heads]
-    double scale,
-    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
-    torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
-    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double k_scale, double v_scale,
-    const int64_t tp_rank, const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
-  const bool is_block_sparse = (blocksparse_vert_stride > 1);
-  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
-                             CALL_V2_LAUNCHER_BLOCK_SIZE)
-}
-
 #undef WARP_SIZE
 #undef MAX
 #undef MIN

--- a/csrc/attention/attention_kernels_opt.cu
+++ b/csrc/attention/attention_kernels_opt.cu
@@ -873,13 +873,10 @@ void paged_attention_v1_launcher(
      blocksparse_block_size, blocksparse_head_sliding_step);

 #define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  switch (is_block_sparse) {                                               \
-    case true:                                                             \
-      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);     \
-      break;                                                               \
-    case false:                                                            \
-      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);    \
-      break;                                                               \
+  if (is_block_sparse) {                                                   \
+    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);       \
+  } else {                                                                 \
+    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);      \
  }

 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
@@ -1020,13 +1017,10 @@ void paged_attention_v2_launcher(
      blocksparse_head_sliding_step);

 #define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  switch (is_block_sparse) {                                               \
-    case true:                                                             \
-      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);     \
-      break;                                                               \
-    case false:                                                            \
-      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);    \
-      break;                                                               \
+  if (is_block_sparse) {                                                   \
+    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);       \
+  } else {                                                                 \
+    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);      \
  }

 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes