Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -8,11 +8,11 @@ from tabulate import tabulate

 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random,
+    set_random_seed,
 )

 logger = init_logger(__name__)
@@ -36,7 +36,7 @@ def run_benchmark(
    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

-    current_platform.seed_everything(42)
+    set_random_seed(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].

--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -7,15 +7,15 @@ import torch
 from tabulate import tabulate

 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    create_kv_caches_with_random_flash,
+    set_random_seed,
+)
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
 )

 logger = init_logger(__name__)
@@ -49,7 +49,7 @@ def run_benchmark(
    if implementation == "triton" and kv_cache_layout == "HND":
        return float("nan")  # Triton does not support HND layout yet.

-    current_platform.seed_everything(42)
+    set_random_seed(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].

--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -23,9 +23,9 @@ import torch
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    persistent_masked_m_silu_mul_quant,
 )
-from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.torch_utils import set_random_seed


 @triton.jit
@@ -207,7 +207,7 @@ def benchmark(
 ):
    def generate_data(seed_offset=0):
        """Generate input data with given seed offset"""
-        current_platform.seed_everything(42 + seed_offset)
+        set_random_seed(42 + seed_offset)
        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()

        if gen_strategy == "random_imbalanced":

--- a/benchmarks/kernels/cpu/benchmark_cpu_attn.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+import time
+
+import numpy as np
+import torch
+
+from vllm._custom_ops import (
+    cpu_attention_with_kv_cache,
+    cpu_attn_get_scheduler_metadata,
+    cpu_attn_reshape_and_cache,
+)
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa
+
+
+def get_attn_isa(
+    block_size: int | None = None,
+    dtype: torch.dtype | None = None,
+):
+    """Pick the ISA name used to benchmark the CPU attention kernels.
+
+    When both ``block_size`` and ``dtype`` are truthy the choice is delegated
+    to ``_get_attn_isa``; otherwise it is derived from the host CPU alone:
+    ``"neon"`` on ARM, ``"amx"`` when torch reports AMX tile support, and
+    ``"vec"`` in every other case.
+    """
+    if block_size and dtype:
+        return _get_attn_isa(dtype, block_size)
+
+    # No usable shape/dtype hint: fall back to hardware detection.
+    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
+        return "neon"
+    return "amx" if torch._C._cpu._is_amx_tile_supported() else "vec"
+
+
+# Random number generation dominates setup time, so memoise the tensors.
+@functools.lru_cache(maxsize=128, typed=False)
+def tensor_cache(
+    elem_num: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Return a 1-D standard-normal tensor holding ``elem_num`` elements.
+
+    Results are memoised per ``(elem_num, dtype)``, so a repeated call hands
+    back the very same tensor object instead of drawing fresh values.
+    """
+    return torch.randn(elem_num, dtype=dtype)
+
+
+@torch.inference_mode()
+def main(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None = None,
+    dtype: torch.dtype = torch.bfloat16,
+    block_size: int = 128,
+    num_blocks: int = 4096,
+    use_sink: bool = False,
+    enable_kv_split: bool = False,
+    isa: str | None = None,
+    seed: int = 0,
+    iters: int = 20,
+) -> None:
+    """Time ``cpu_attention_with_kv_cache`` on a synthetic batch and print stats.
+
+    Args:
+        seq_lens: One ``(query_len, kv_len)`` pair per sequence in the batch.
+        num_heads: ``(num_query_heads, num_kv_heads)``; the first must be a
+            multiple of the second.
+        head_size: Size of each attention head.
+        sliding_window: Sliding-window size, or ``None`` to disable it.
+        dtype: Element type of the query and KV-cache tensors.
+        block_size: Number of token slots per KV-cache block.
+        num_blocks: Number of KV-cache blocks to allocate.
+        use_sink: Pass a random per-query-head ``s_aux`` tensor to the kernel.
+        enable_kv_split: Forwarded to ``cpu_attn_get_scheduler_metadata``.
+        isa: ISA name handed to the kernels; resolved with ``get_attn_isa``
+            when ``None``.
+        seed: Seed for the global random number generators.
+        iters: Number of timed iterations (five warm-up calls always run).
+
+    Prints min / max / mean / std / median latency in milliseconds. When
+    ``iters`` yields no timed iterations a hint is printed instead.
+    """
+    # NOTE(review): other kernel benchmarks seed through
+    # vllm.utils.torch_utils.set_random_seed; consider aligning this call once
+    # it is confirmed that seed_everything is being phased out.
+    current_platform.seed_everything(seed)
+    num_seqs = len(seq_lens)
+    query_lens = [x[0] for x in seq_lens]
+    kv_lens = [x[1] for x in seq_lens]
+    num_query_heads = num_heads[0]
+    num_kv_heads = num_heads[1]
+    assert num_query_heads % num_kv_heads == 0
+    max_kv_len = max(kv_lens)
+    # (-1, -1) is passed to the kernel when no sliding window is requested.
+    window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1)
+    scale = head_size**-0.5
+    token_num = sum(query_lens)
+
+    if isa is None:
+        isa = get_attn_isa(block_size, dtype)
+
+    s_aux = (
+        15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None
+    )
+
+    # Query tokens of all sequences are laid out back to back.
+    query = tensor_cache(
+        elem_num=token_num * num_query_heads * head_size,
+        dtype=dtype,
+    )
+    query = query.view(
+        token_num,
+        num_query_heads,
+        head_size,
+    )
+
+    # One flat random buffer provides both the key and the value source data.
+    key_value = tensor_cache(
+        elem_num=2 * num_blocks * num_kv_heads * block_size * head_size,
+        dtype=dtype,
+    )
+    key_value = key_value.view(
+        2,
+        num_blocks,
+        block_size,
+        num_kv_heads,
+        head_size,
+    )
+    key_cache, value_cache = key_value.unbind(0)
+
+    # KV cache for CPU attention
+    packed_key_cache = torch.empty(
+        num_blocks, num_kv_heads, block_size, head_size, dtype=dtype
+    )
+    packed_value_cache = torch.empty_like(packed_key_cache)
+
+    # Cumulative query lengths with a leading zero.
+    cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(
+        dim=0, dtype=torch.int32
+    )
+    kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32)
+    max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size
+    block_tables = torch.randint(
+        0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
+    )
+
+    # use reshape_and_cache to pack key_cache and value_cache
+    slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64)
+    cpu_attn_reshape_and_cache(
+        key=key_cache.view(-1, num_kv_heads, head_size),
+        value=value_cache.view(-1, num_kv_heads, head_size),
+        key_cache=packed_key_cache,
+        value_cache=packed_value_cache,
+        slot_mapping=slot_mapping,
+        isa=isa,
+    )
+
+    metadata = cpu_attn_get_scheduler_metadata(
+        num_reqs=num_seqs,
+        num_heads=num_query_heads,
+        num_kv_heads=num_kv_heads,
+        head_dim=head_size,
+        seq_lens=kv_lens_tensor,
+        dtype=dtype,
+        query_start_loc=cu_query_lens,
+        causal=True,
+        sliding_window_size=sliding_window if sliding_window is not None else -1,
+        isa=isa,
+        enable_kv_split=enable_kv_split,
+    )
+
+    out_with_split = torch.empty_like(query)
+
+    def run_benchmark(iters: int) -> list[float]:
+        """Call the kernel ``iters`` times; return per-call latency in ms."""
+        times = []
+        for _ in range(iters):
+            start_time = time.perf_counter_ns()
+            cpu_attention_with_kv_cache(
+                query=query,
+                key_cache=packed_key_cache,
+                value_cache=packed_value_cache,
+                output=out_with_split,
+                query_start_loc=cu_query_lens,
+                seq_lens=kv_lens_tensor,
+                scale=scale,
+                causal=True,
+                alibi_slopes=None,
+                sliding_window=window_size,
+                block_table=block_tables,
+                softcap=0,
+                scheduler_metadata=metadata,
+                s_aux=s_aux,
+            )
+            end_time = time.perf_counter_ns()
+            # perf_counter_ns() reports nanoseconds; convert to milliseconds.
+            times.append((end_time - start_time) / 1e6)
+        return times
+
+    # warmup
+    run_benchmark(5)
+    # benchmark
+    times = run_benchmark(iters)
+
+    # min()/max() raise ValueError on an empty list, so bail out early when
+    # no timed iteration ran (same guard as the CPU fused-MoE benchmark).
+    if not times:
+        print("No iterations to measure. Set --iters > 0.")
+        return
+
+    time_min = min(times)
+    time_max = max(times)
+    time_mean = np.mean(times)
+    time_std = np.std(times)
+
+    print("\tmin (ms) = ", time_min)
+    print("\tmax (ms) = ", time_max)
+    print("\tmean (ms) = ", time_mean)
+    print("\tstd = ", time_std)
+    print("\tmedian (ms) = ", np.median(times))
+
+
+def generate_seq_lens(
+    batch_size: int,
+    q_len_min: int,
+    q_len_max: int,
+    kv_len_min: int,
+    kv_len_max: int,
+    seed: int = 0,
+) -> list[tuple[int, int]]:
+    """Sample ``batch_size`` reproducible ``(query_len, kv_len)`` pairs.
+
+    Each KV length is drawn first from
+    ``[max(kv_len_min, q_len_min), kv_len_max]``; the query length is then
+    drawn from ``[q_len_min, min(q_len_max, kv_len)]`` so it never exceeds the
+    KV length. All bounds are inclusive. A dedicated CPU generator seeded with
+    ``seed`` is used for every draw.
+    """
+    assert 1 <= q_len_min <= q_len_max
+    assert 1 <= kv_len_min <= kv_len_max
+    assert kv_len_max >= q_len_min
+
+    rng = torch.Generator(device="cpu").manual_seed(seed)
+    kv_floor = max(kv_len_min, q_len_min)
+
+    def sample_inclusive(low: int, high: int) -> int:
+        # torch.randint excludes its upper bound, hence the +1.
+        return torch.randint(low, high + 1, (1,), generator=rng).item()
+
+    pairs: list[tuple[int, int]] = []
+    while len(pairs) < batch_size:
+        # The KV length is sampled first because it caps the query length.
+        kv_len = sample_inclusive(kv_floor, kv_len_max)
+        q_len = sample_inclusive(q_len_min, min(q_len_max, kv_len))
+        pairs.append((q_len, kv_len))
+    return pairs
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.")
+
+    # Plain integer options, registered in the order they appear in --help.
+    for flag, default in (
+        ("--batch-size", 64),
+        ("--q-len-min", 512),
+        ("--q-len-max", 512),
+        ("--kv-len-min", 512),
+        ("--kv-len-max", 512),
+        ("--num-blocks", 4096),
+        ("--sliding-window", None),
+        ("--num-query-heads", 32),
+        ("--num-kv-heads", 8),
+    ):
+        parser.add_argument(flag, type=int, default=default)
+
+    # Options restricted to a fixed set of values, plus boolean switches.
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=CPUAttentionBackend.get_supported_head_sizes(),
+        default=128,
+    )
+    parser.add_argument("--enable-kv-split", action="store_true")
+    parser.add_argument("--block-size", type=int, choices=[32, 64, 128], default=128)
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    parser.add_argument("--use-sink", action="store_true")
+    parser.add_argument(
+        "--isa", type=str, choices=["vec", "neon", "amx", "vec16"], default=None
+    )
+    for flag, default in (("--seed", 0), ("--iters", 20)):
+        parser.add_argument(flag, type=int, default=default)
+
+    args = parser.parse_args()
+    print(args)
+
+    seq_lens = generate_seq_lens(
+        batch_size=args.batch_size,
+        q_len_min=args.q_len_min,
+        q_len_max=args.q_len_max,
+        kv_len_min=args.kv_len_min,
+        kv_len_max=args.kv_len_max,
+        seed=args.seed,
+    )
+
+    print("batch (query len, kv len) = ", seq_lens)
+
+    torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+    # An explicit --isa wins; otherwise derive it from block size and dtype.
+    isa = args.isa
+    if isa is None:
+        isa = get_attn_isa(args.block_size, torch_dtype)
+
+    main(
+        seq_lens=seq_lens,
+        num_heads=(args.num_query_heads, args.num_kv_heads),
+        head_size=args.head_size,
+        sliding_window=args.sliding_window,
+        dtype=torch_dtype,
+        block_size=args.block_size,
+        num_blocks=args.num_blocks,
+        use_sink=args.use_sink,
+        enable_kv_split=args.enable_kv_split,
+        isa=isa,
+        seed=args.seed,
+        iters=args.iters,
+    )
--- a/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
+++ b/benchmarks/kernels/cpu/benchmark_cpu_fused_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import sys
+import time
+
+import numpy as np
+import torch
+
+from vllm.platforms import current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+# Check if CPU MoE operations are available
+try:
+    from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
+except (ImportError, AttributeError) as e:
+    print("ERROR: CPU fused MoE operations are not available on this platform.")
+    print("This benchmark requires x86 CPU with proper vLLM CPU extensions compiled.")
+    print(
+        "The cpu_fused_moe kernel is typically available on Linux x86_64 "
+        "with AVX2/AVX512."
+    )
+    print(f"Import error: {e}")
+    sys.exit(1)
+
+# ISA selection following test_cpu_fused_moe.py pattern: "amx" is offered (and
+# listed first, so it becomes the --isa default below) only when torch reports
+# AMX tile support; "vec" is always offered.
+ISA_CHOICES = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
+
+
+@torch.inference_mode()
+def main(
+    batch_size: int,
+    expert_num: int,
+    hidden_size: int,
+    intermediate_size: int,
+    topk_num: int,
+    use_bias: bool = False,
+    dtype: torch.dtype = torch.bfloat16,
+    activation: str = "silu",
+    isa: str = "vec",
+    seed: int = 0,
+    iters: int = 20,
+) -> None:
+    """Time ``cpu_fused_moe`` on random inputs and print latency statistics.
+
+    Args:
+        batch_size: Number of input rows (tokens).
+        expert_num: Number of experts.
+        hidden_size: Width of the input and of each expert's output.
+        intermediate_size: Expert intermediate width; the fused gate + up
+            projection uses ``2 * intermediate_size`` rows.
+        topk_num: Number of experts selected per token.
+        use_bias: Also generate random bias tensors for both projections.
+        dtype: Element type of the inputs, weights and router logits.
+        activation: Activation name forwarded to ``cpu_fused_moe``.
+        isa: ISA name forwarded to the prepack and fused-MoE ops.
+        seed: Seed for the global random number generators.
+        iters: Number of timed iterations (five warm-up calls always run).
+
+    Prints min / max / mean / std / median latency in milliseconds and an
+    estimated TFLOP/s figure. Returns early with a hint when no timed
+    iteration ran.
+    """
+    # NOTE(review): other kernel benchmarks seed through
+    # vllm.utils.torch_utils.set_random_seed; consider aligning this call once
+    # it is confirmed that seed_everything is being phased out.
+    current_platform.seed_everything(seed)
+    # up_dim = 2 * intermediate_size for gate + up projection
+    up_dim = 2 * intermediate_size
+
+    # Random tensors are divided by 0.5 * sqrt(dim) to shrink their magnitude.
+    input_tensor = torch.randn((batch_size, hidden_size), dtype=dtype) / (
+        0.5 * hidden_size**0.5
+    )
+
+    w13 = torch.randn((expert_num, up_dim, hidden_size), dtype=dtype) / (
+        0.5 * hidden_size**0.5
+    )
+    w2 = torch.randn((expert_num, hidden_size, intermediate_size), dtype=dtype) / (
+        0.5 * intermediate_size**0.5
+    )
+
+    w13_bias = None
+    w2_bias = None
+    if use_bias:
+        w13_bias = torch.randn((expert_num, up_dim), dtype=dtype) / (0.5 * up_dim**0.5)
+        w2_bias = torch.randn((expert_num, hidden_size), dtype=dtype) / (
+            0.5 * hidden_size**0.5
+        )
+
+    # Routing: softmax over random logits, then keep the top-k experts per row.
+    router_logits = torch.randn((batch_size, expert_num), dtype=dtype)
+    score = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
+    topk_weights, topk_ids = torch.topk(score, topk_num)
+    topk_ids = topk_ids.to(torch.int32)
+
+    # Weights are prepacked once, outside the timed region.
+    packed_w13 = cpu_prepack_moe_weight(w13, isa)
+    packed_w2 = cpu_prepack_moe_weight(w2, isa)
+
+    def run_benchmark(iters: int) -> list[float]:
+        """Call the kernel ``iters`` times; return per-call latency in ms."""
+        times = []
+        for _ in range(iters):
+            start_time = time.perf_counter_ns()
+            _ = cpu_fused_moe(
+                input_tensor,
+                packed_w13,
+                packed_w2,
+                w13_bias,
+                w2_bias,
+                topk_weights,
+                topk_ids,
+                activation,
+                isa,
+            )
+            end_time = time.perf_counter_ns()
+            # perf_counter_ns() reports nanoseconds; convert to milliseconds.
+            times.append((end_time - start_time) / 1e6)
+        return times
+
+    # warmup
+    run_benchmark(5)
+    # benchmark
+    times = run_benchmark(iters)
+
+    # min()/max() below would raise on an empty list.
+    if not times:
+        print("No iterations to measure. Set --iters > 0.")
+        return
+
+    time_min = min(times)
+    time_max = max(times)
+    time_mean = np.mean(times)
+    time_std = np.std(times)
+
+    print("\tmin (ms) = ", time_min)
+    print("\tmax (ms) = ", time_max)
+    print("\tmean (ms) = ", time_mean)
+    print("\tstd = ", time_std)
+    print("\tmedian (ms) = ", np.median(times))
+
+    # Calculate throughput metrics
+    # FLOPs estimation: 2 * batch * topk * (hidden * up_dim + intermediate * hidden)
+    flops_per_token = (
+        2 * topk_num * (hidden_size * up_dim + intermediate_size * hidden_size)
+    )
+    total_flops = batch_size * flops_per_token
+    # time_mean is in milliseconds: scale to seconds, then FLOP/s to TFLOP/s.
+    tflops = total_flops / (time_mean * 1e-3) / 1e12
+    print(f"\tthroughput (TFLOP/s) = {tflops:.4f}")
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(description="Benchmark the CPU fused MoE kernel.")
+
+    # Problem-size options, registered in the order they appear in --help.
+    for flag, default in (
+        ("--batch-size", 64),
+        ("--expert-num", 8),
+        ("--hidden-size", 2880),
+        ("--intermediate-size", 2880),
+    ):
+        parser.add_argument(flag, type=int, default=default)
+
+    parser.add_argument(
+        "--topk-num",
+        type=int,
+        default=None,
+        help="Number of experts to route each token to (default: expert_num // 2)",
+    )
+    parser.add_argument("--use-bias", action="store_true")
+    parser.add_argument(
+        "--activation",
+        type=str,
+        choices=["silu", "swigluoai"],
+        default="silu",
+        help="Activation function",
+    )
+    parser.add_argument(
+        "--isa",
+        type=str,
+        choices=ISA_CHOICES,
+        default=ISA_CHOICES[0],
+        help=f"ISA to use (available: {ISA_CHOICES})",
+    )
+    for flag, default in (("--seed", 0), ("--iters", 20)):
+        parser.add_argument(flag, type=int, default=default)
+
+    args = parser.parse_args()
+
+    # Without an explicit --topk-num, route to half of the experts (at least 1).
+    topk_num = args.topk_num
+    if topk_num is None:
+        topk_num = max(args.expert_num // 2, 1)
+
+    print(args)
+
+    main(
+        batch_size=args.batch_size,
+        expert_num=args.expert_num,
+        hidden_size=args.hidden_size,
+        intermediate_size=args.intermediate_size,
+        topk_num=topk_num,
+        use_bias=args.use_bias,
+        dtype=torch.bfloat16,  # Following test_cpu_fused_moe.py
+        activation=args.activation,
+        isa=args.isa,
+        seed=args.seed,
+        iters=args.iters,
+    )
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -330,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        PUBLIC ${oneDNN_BINARY_DIR}/include
        PRIVATE ${oneDNN_SOURCE_DIR}/src
    )
-    target_link_libraries(dnnl_ext dnnl)
+    target_link_libraries(dnnl_ext dnnl torch)
    target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
    list(APPEND LIBS dnnl_ext)
    set(USE_ONEDNN ON)
@@ -358,13 +358,13 @@ set(VLLM_EXT_SRC
    "csrc/cpu/pos_encoding.cpp"
    "csrc/moe/dynamic_4bit_int_moe_cpu.cpp"
    "csrc/cpu/cpu_attn.cpp"
-    "csrc/cpu/scratchpad_manager.cpp"
    "csrc/cpu/torch_bindings.cpp")

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/shm.cpp"
        "csrc/cpu/cpu_wna16.cpp"
+        "csrc/cpu/cpu_fused_moe.cpp"
        ${VLLM_EXT_SRC})
    if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
        set(VLLM_EXT_SRC

--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # sm90a

 set(SUPPORT_ARCHS)
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
-    list(APPEND SUPPORT_ARCHS 9.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
+    list(APPEND SUPPORT_ARCHS "9.0a")
 endif()
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
-    list(APPEND SUPPORT_ARCHS 10.0a)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
+    # CUDA 12.9 has introduced "Family-Specific Architecture Features"
+    # this supports all compute_10x family
+    list(APPEND SUPPORT_ARCHS "10.0f")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    list(APPEND SUPPORT_ARCHS "10.0a")
 endif()


 cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
 if(FLASH_MLA_ARCHS)
+    message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
    set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
    list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")

@@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS)
        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-    # Create empty targets for setup.py when not targeting sm90a systems
+    message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
+    # Create empty targets for setup.py on unsupported systems
    add_custom_target(_flashmla_C)
    add_custom_target(_flashmla_extension_C)
 endif()

--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@@ -31,10 +31,15 @@ if(NOT qutlass_SOURCE_DIR)
 endif()
 message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}")

-cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+  cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}")
+else()
+  cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}")
+endif()
+
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS)

-  if(QUTLASS_ARCHS MATCHES "10\\.0a")
+  if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)")
    set(QUTLASS_TARGET_CC 100)
  elseif(QUTLASS_ARCHS MATCHES "12\\.0a")
    set(QUTLASS_TARGET_CC 120)

--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
+          GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x,
                                            const scalar_t& y) {
  return act_first ? ACT_FN(x) * y : x * ACT_FN(y);
 }
-// Activation and gating kernel template.

+// Check if all pointers are 16-byte aligned for int4 vectorized access
+__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) & 15) == 0;
+}
+
+// Activation and gating kernel template.
 template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
          bool act_first>
 __global__ void act_and_mul_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., 2, d]
    const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access.
+  // All three pointers must be 16-byte aligned for safe int4 operations.
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = compute<scalar_t, ACT_FN, act_first>(xp[j], yp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = compute<scalar_t, ACT_FN, act_first>(VLLM_LDG(&x_ptr[i]),
+                                                        VLLM_LDG(&y_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = compute<scalar_t, ACT_FN, act_first>(x, y);
+    }
  }
 }

@@ -120,50 +162,115 @@ template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&, const float)>
 __global__ void act_and_mul_kernel_with_param(
    scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d,
    const float param) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x, param) * y;
+  const scalar_t* x_ptr = input + token_idx * 2 * d;
+  const scalar_t* y_ptr = x_ptr + d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) &&
+                       is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* x_vec = reinterpret_cast<const int4*>(x_ptr);
+    const int4* y_vec = reinterpret_cast<const int4*>(y_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r;
+      auto* xp = reinterpret_cast<scalar_t*>(&x);
+      auto* yp = reinterpret_cast<scalar_t*>(&y);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(xp[j], param) * yp[j];
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x, param) * y;
+    }
  }
 }

 template <typename T>
 __device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
                                               float alpha, float limit) {
-  // clamp gate: min=None, max=limit
-  const float gate_f = (float)gate;
-  const float clamped_gate = gate_f > limit ? limit : gate_f;
-
-  // clamp up: min=-limit, max=limit
-  const float up_f = (float)up;
-  const float clamped_up =
-      up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
-
-  // glu = gate * sigmoid(gate * alpha)
-  const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
-  const float glu = clamped_gate * sigmoid_val;
-
-  // (up + 1) * glu
-  return (T)((clamped_up + 1.0f) * glu);
+  // Clamp gate to (-inf, limit] and up to [-limit, limit]
+  const float g = fminf((float)gate, limit);
+  const float u = fmaxf(fminf((float)up, limit), -limit);
+  // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu
+  return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha)));
 }

+// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...].
 template <typename scalar_t,
          scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
                             const float)>
 __global__ void swigluoai_and_mul_kernel(
    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const scalar_t* __restrict__ input,  // [..., 2 * d] (interleaved)
    const int d, const float alpha, const float limit) {
+  // For interleaved data: input has 2*d elements per token (gate/up pairs)
+  // output has d elements per token
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
+  constexpr int PAIRS = VEC_SIZE / 2;  // Number of gate/up pairs per int4 load
  const int64_t token_idx = blockIdx.x;
-  // TODO: Vectorize loads and stores.
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    // gate = x[..., ::2]  (even indices)
-    const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
-    // up = x[..., 1::2]   (odd indices)
-    const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
-
-    out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
+  const scalar_t* in_ptr = input + token_idx * 2 * d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access on input.
+  // For output we use int2 (64-bit) which has 8-byte alignment requirement.
+  const bool in_aligned = is_16byte_aligned(in_ptr);
+  const bool out_aligned =
+      (reinterpret_cast<uintptr_t>(out_ptr) & 7) == 0;  // 8-byte for int2
+
+  if (in_aligned && out_aligned && d >= PAIRS) {
+    // Fast path: vectorized loop
+    // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs
+    // Each int2 store writes PAIRS output elements
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int2* out_vec = reinterpret_cast<int2*>(out_ptr);
+    const int num_vecs = d / PAIRS;
+    const int vec_end = num_vecs * PAIRS;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]);
+      int2 r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < PAIRS; j++) {
+        rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]),
+                          VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit);
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      // gate = x[..., ::2]  (even indices)
+      const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]);
+      // up = x[..., 1::2]   (odd indices)
+      const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]);
+      out_ptr[idx] = ACT_FN(gate, up, alpha, limit);
+    }
  }
 }

@@ -217,10 +324,41 @@ __global__ void activation_kernel(
    scalar_t* __restrict__ out,          // [..., d]
    const scalar_t* __restrict__ input,  // [..., d]
    const int d) {
+  constexpr int VEC_SIZE = 16 / sizeof(scalar_t);
  const int64_t token_idx = blockIdx.x;
-  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
-    out[token_idx * d + idx] = ACT_FN(x);
+  const scalar_t* in_ptr = input + token_idx * d;
+  scalar_t* out_ptr = out + token_idx * d;
+
+  // Check alignment for 128-bit vectorized access
+  const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr);
+
+  if (aligned && d >= VEC_SIZE) {
+    // Fast path: 128-bit vectorized loop
+    const int4* in_vec = reinterpret_cast<const int4*>(in_ptr);
+    int4* out_vec = reinterpret_cast<int4*>(out_ptr);
+    const int num_vecs = d / VEC_SIZE;
+    const int vec_end = num_vecs * VEC_SIZE;
+
+    for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) {
+      int4 v = VLLM_LDG(&in_vec[i]), r;
+      auto* vp = reinterpret_cast<scalar_t*>(&v);
+      auto* rp = reinterpret_cast<scalar_t*>(&r);
+#pragma unroll
+      for (int j = 0; j < VEC_SIZE; j++) {
+        rp[j] = ACT_FN(vp[j]);
+      }
+      out_vec[i] = r;
+    }
+    // Scalar cleanup for remaining elements
+    for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) {
+      out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i]));
+    }
+  } else {
+    // Scalar fallback for unaligned data or small d
+    for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&in_ptr[idx]);
+      out_ptr[idx] = ACT_FN(x);
+    }
  }
 }


--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -9,16 +9,6 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping);
-
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping);
-
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
@@ -43,6 +33,13 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
                          const std::string& kv_cache_dtype,
                          torch::Tensor& scale);

+// NOTE: k_pe and kv_c order is flipped compared to concat_and_cache_mla
+void concat_and_cache_mla_rope_fused(
+    torch::Tensor& positions, torch::Tensor& q_pe, torch::Tensor& k_pe,
+    torch::Tensor& kv_c, torch::Tensor& rope_cos_sin_cache, bool rope_is_neox,
+    torch::Tensor& kv_cache_slot_mapping, torch::Tensor& kv_cache,
+    const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale);
+
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);

--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -124,94 +124,6 @@ __global__ void copy_blocks_mla_kernel(

 }  // namespace vllm

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  int num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = key_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda());
-
-  // Create data structures for the kernel.
-  // Create an array of pointers to the key and value caches.
-  int64_t key_cache_ptrs[num_layers];
-  int64_t value_cache_ptrs[num_layers];
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    key_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
-    value_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
-  }
-
-  // block_mapping is a 2D tensor with shape (num_pairs, 2).
-  int num_pairs = block_mapping.size(0);
-
-  // Move the data structures to the GPU.
-  // NOTE: This synchronizes the CPU and GPU.
-  torch::Tensor key_cache_ptrs_tensor =
-      torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-  torch::Tensor value_cache_ptrs_tensor =
-      torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  // Launch the kernel.
-  const int numel_per_block = key_caches[0][0].numel();
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, numel_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
-        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            key_cache_ptrs_tensor.data_ptr<int64_t>(),
-            value_cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), numel_per_block);
-      }));
-}
-
-// copy blocks kernel for MLA (assumes a joint KV-cache)
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping) {
-  int num_layers = kv_caches.size();
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = kv_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
-
-  std::vector<int64_t> cache_ptrs(num_layers);
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
-  }
-  torch::Tensor cache_ptrs_tensor =
-      torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  int num_pairs = block_mapping.size(0);
-  // We use the stride instead of numel in case the cache is padded for memory
-  // alignment reasons, we assume the blocks data (inclusive of any padding)
-  // is contiguous in memory
-  int mem_footprint_per_block = kv_caches[0].stride(0);
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, mem_footprint_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
-        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
-      }));
-}
-
 namespace vllm {

 // Used to copy/convert one element
@@ -770,9 +682,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
  for (int i = 0; i < VEC_SIZE; i++) {
    amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
  }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif

  // Reduced amax
  for (int mask = 16; mask > 0; mask /= 2) {
@@ -782,9 +691,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
 #endif
  }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif
+
 #if defined(__gfx942__)
  float scale = fmaxf(amax, 1e-4) / 224.0f;
 #else

--- a/csrc/cache_kernels_fused.cu
+++ b/csrc/cache_kernels_fused.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+#include "quantization/w8a8/fp8/common.cuh"
+#ifdef USE_ROCM
+  #include "quantization/w8a8/fp8/amd/quant_utils.cuh"
+#else
+  #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh"
+#endif
+
+#ifdef USE_ROCM
+  #include <hip/hip_bf16.h>
+typedef __hip_bfloat16 __nv_bfloat16;
+#endif
+
+namespace vllm {
+
+// Fused rotary-embedding + MLA KV-cache store kernel.
+//
+// Each thread block handles one token (blockIdx.x) and performs three steps:
+//   1. Rotates every head of q_pe in place.
+//   2. Rotates k_pe in place and stores the rotated values in the token's
+//      cache entry, starting at element offset kv_lora_rank.
+//   3. Copies kv_c into the same cache entry, starting at element offset 0.
+// Steps 2 and 3 are skipped when the token's slot index is negative; q_pe is
+// still rotated in that case.
+//
+// IS_NEOX selects which elements form a rotated pair: (i, i + rot_dim / 2)
+// when true, (2 * i, 2 * i + 1) when false. When kv_dt is not
+// Fp8KVCacheDataType::kAuto, values go through fp8::scaled_convert with
+// *kv_cache_quant_scale before being stored.
+//
+// NOTE Be EXTRA careful with raw_kv_scalar_t, for __half and __nv_bfloat16 it's
+// using u16 as the backing type.
+template <typename qk_t, bool IS_NEOX, typename raw_kv_scalar_t,
+          typename cache_t, Fp8KVCacheDataType kv_dt>
+__global__ void concat_and_cache_mla_rope_fused_kernel(
+    const int64_t* __restrict__ positions,  // [num_tokens]
+    qk_t* __restrict__ q_pe,        // [num_tokens, num_q_heads, rot_dim]
+    qk_t* __restrict__ k_pe,        // [num_tokens, rot_dim]
+    const qk_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank]
+    const qk_t* __restrict__ rope_cos_sin_cache,  // [max_position, 2,
+                                                  // rot_dim // 2]
+    const int rot_dim, const int64_t q_pe_stride_token,
+    const int64_t q_pe_stride_head, const int64_t k_pe_stride,
+    const int64_t kv_c_stride, const int num_q_heads,
+    cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank +
+                                     // rot_dim)]
+    const int64_t* __restrict__ kv_cache_slot_mapping,  // [num_tokens]
+    const int block_stride, const int entry_stride, const int kv_lora_rank,
+    const int block_size, const float* kv_cache_quant_scale) {
+  // Each thread block is responsible for one token.
+  const int64_t token_idx = blockIdx.x;
+  const int64_t pos = positions[token_idx];
+
+  // Row of the cos/sin cache for this position: the loads below read cos from
+  // the first rot_dim / 2 entries and sin from the following rot_dim / 2.
+  const qk_t* cos_sin_ptr = rope_cos_sin_cache + pos * rot_dim;
+
+  const int embed_dim = rot_dim / 2;
+
+  // Q ROPE
+  // Work items are (head, pair) combinations flattened into a single index.
+  const int nq = num_q_heads * embed_dim;
+  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
+    int head_idx = i / embed_dim;
+    int pair_idx = i % embed_dim;
+
+    // NOTE: Would be nice to have interleaved sin/cos so we could just load
+    // both at the same time.
+    qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
+    qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+
+    qk_t* q_pe_head_ptr =
+        q_pe + token_idx * q_pe_stride_token + head_idx * q_pe_stride_head;
+
+    int pair_idx_x, pair_idx_y;
+    if constexpr (IS_NEOX) {
+      // GPT-NeoX style rotary embedding.
+      pair_idx_x = pair_idx;
+      pair_idx_y = embed_dim + pair_idx;
+    } else {
+      // GPT-J style rotary embedding.
+      pair_idx_x = pair_idx * 2;
+      pair_idx_y = pair_idx * 2 + 1;
+    }
+
+    qk_t x_src = q_pe_head_ptr[pair_idx_x];
+    qk_t y_src = q_pe_head_ptr[pair_idx_y];
+
+    // 2D rotation of the (x, y) pair by the angle encoded in (cos, sin).
+    qk_t x_dst = x_src * cos - y_src * sin;
+    qk_t y_dst = y_src * cos + x_src * sin;
+
+    q_pe_head_ptr[pair_idx_x] = x_dst;
+    q_pe_head_ptr[pair_idx_y] = y_dst;
+  }
+
+  // block_idx / entry_idx are only used after the negative-slot check below.
+  const int64_t slot_idx = kv_cache_slot_mapping[token_idx];
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t entry_idx = slot_idx % block_size;
+
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+
+  // K with 1 HEAD
+  for (int i = threadIdx.x; i < embed_dim; i += blockDim.x) {
+    int pair_idx = i;
+
+    qk_t cos = VLLM_LDG(cos_sin_ptr + pair_idx);
+    qk_t sin = VLLM_LDG(cos_sin_ptr + pair_idx + embed_dim);
+
+    qk_t* k_pe_head_ptr = k_pe + token_idx * k_pe_stride;
+
+    int pair_idx_x, pair_idx_y;
+    if constexpr (IS_NEOX) {
+      // GPT-NeoX style rotary embedding.
+      pair_idx_x = pair_idx;
+      pair_idx_y = embed_dim + pair_idx;
+    } else {
+      // GPT-J style rotary embedding.
+      pair_idx_x = pair_idx * 2;
+      pair_idx_y = pair_idx * 2 + 1;
+    }
+
+    qk_t x_src = k_pe_head_ptr[pair_idx_x];
+    qk_t y_src = k_pe_head_ptr[pair_idx_y];
+
+    qk_t x_dst = x_src * cos - y_src * sin;
+    qk_t y_dst = y_src * cos + x_src * sin;
+
+    // The rotated key is written back to k_pe as well as to the cache.
+    k_pe_head_ptr[pair_idx_x] = x_dst;
+    k_pe_head_ptr[pair_idx_y] = y_dst;
+
+    // NOTE Why is this monster necessary?
+    // When K is of type float16, the actual template replacement for
+    // raw_kv_scalar_t will be u16. That's why it's used at the last moment
+    // otherwise CUDA ALU would break.
+    const raw_kv_scalar_t raw_x_value =
+        *reinterpret_cast<const raw_kv_scalar_t*>(&x_dst);
+    const raw_kv_scalar_t raw_y_value =
+        *reinterpret_cast<const raw_kv_scalar_t*>(&y_dst);
+
+    // Rotary part of the cache entry: it follows the kv_lora_rank elements
+    // written by the NOPE loop below.
+    cache_t* kv_cache_ptr = kv_cache + block_idx * block_stride +
+                            entry_idx * entry_stride + kv_lora_rank;
+
+    // MLA Cache Store
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      kv_cache_ptr[pair_idx_x] = raw_x_value;
+      kv_cache_ptr[pair_idx_y] = raw_y_value;
+    } else {
+      kv_cache_ptr[pair_idx_x] =
+          fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
+              raw_x_value, *kv_cache_quant_scale);
+      kv_cache_ptr[pair_idx_y] =
+          fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
+              raw_y_value, *kv_cache_quant_scale);
+    }
+  }
+
+  // NOPE
+  // kv_c is copied (or converted) unchanged into the start of the cache entry.
+  for (int i = threadIdx.x; i < kv_lora_rank; i += blockDim.x) {
+    const qk_t* src_ptr = kv_c + token_idx * kv_c_stride + i;
+    const raw_kv_scalar_t src_value =
+        *reinterpret_cast<const raw_kv_scalar_t*>(src_ptr);
+
+    cache_t* kv_cache_ptr =
+        kv_cache + block_idx * block_stride + entry_idx * entry_stride;
+
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      kv_cache_ptr[i] = src_value;
+    } else {
+      kv_cache_ptr[i] = fp8::scaled_convert<cache_t, raw_kv_scalar_t, kv_dt>(
+          src_value, *kv_cache_quant_scale);
+    }
+  }
+}
+
+}  // namespace vllm
+
+// Launches concat_and_cache_mla_rope_fused_kernel for one
+// (RAW_KV_T, CACHE_T, KV_DTYPE) combination. The q/k element type is
+// dispatched from q_pe.scalar_type() and rope_is_neox selects the IS_NEOX
+// template argument. The tensors, strides, sizes and launch parameters
+// (grid, block, stream) are taken by name from the expansion site.
+#define CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED(RAW_KV_T, CACHE_T, KV_DTYPE)      \
+  do {                                                                         \
+    VLLM_DISPATCH_FLOATING_TYPES(q_pe.scalar_type(), "qk_scalar_type", [&] {   \
+      using qk_t = scalar_t;                                                   \
+      if (rope_is_neox) {                                                      \
+        vllm::concat_and_cache_mla_rope_fused_kernel<qk_t, true, RAW_KV_T,     \
+                                                     CACHE_T, KV_DTYPE>        \
+            <<<grid, block, 0, stream>>>(                                      \
+                positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),          \
+                k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                  \
+                rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim,                  \
+                q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
+                num_q_heads, reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),  \
+                kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,       \
+                entry_stride, kv_lora_rank, block_size,                        \
+                kv_cache_quant_scale.data_ptr<float>());                       \
+      } else {                                                                 \
+        vllm::concat_and_cache_mla_rope_fused_kernel<qk_t, false, RAW_KV_T,    \
+                                                     CACHE_T, KV_DTYPE>        \
+            <<<grid, block, 0, stream>>>(                                      \
+                positions.data_ptr<int64_t>(), q_pe.data_ptr<qk_t>(),          \
+                k_pe.data_ptr<qk_t>(), kv_c.data_ptr<qk_t>(),                  \
+                rope_cos_sin_cache.data_ptr<qk_t>(), rot_dim,                  \
+                q_pe_stride_token, q_pe_stride_head, k_pe_stride, kv_c_stride, \
+                num_q_heads, reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),  \
+                kv_cache_slot_mapping.data_ptr<int64_t>(), block_stride,       \
+                entry_stride, kv_lora_rank, block_size,                        \
+                kv_cache_quant_scale.data_ptr<float>());                       \
+      }                                                                        \
+    });                                                                        \
+  } while (false)
+
+// Executes RoPE on q_pe and k_pe, then writes k_pe and kv_c in the kv cache.
+// q_pe and k_pe are modified in place.
+// Replaces DeepseekScalingRotaryEmbedding.self.rotary_emb and
+// concat_and_cache_mla.
+//
+// Tensor ranks are validated before any fixed dimension is read, then shapes
+// and dtypes are checked against q_pe. rot_dim must be even because the
+// kernel rotates rot_dim / 2 element pairs. An empty batch (num_tokens == 0)
+// returns without launching the kernel.
+void concat_and_cache_mla_rope_fused(
+    torch::Tensor& positions,           // [num_tokens]
+    torch::Tensor& q_pe,                // [num_tokens, num_q_heads, rot_dim]
+    torch::Tensor& k_pe,                // [num_tokens, rot_dim]
+    torch::Tensor& kv_c,                // [num_tokens, kv_lora_rank]
+    torch::Tensor& rope_cos_sin_cache,  // [max_position, rot_dim]
+    bool rope_is_neox,
+    torch::Tensor&
+        kv_cache_slot_mapping,  // [num_tokens] or [num_actual_tokens]
+    torch::Tensor&
+        kv_cache,  // [num_blocks, block_size, (kv_lora_rank + rot_dim)]
+    const std::string& kv_cache_dtype, torch::Tensor& kv_cache_quant_scale) {
+  // Ranks first: the size()/stride() reads below index fixed dimensions.
+  TORCH_CHECK_EQ(positions.dim(), 1);
+  TORCH_CHECK_EQ(q_pe.dim(), 3);
+  TORCH_CHECK_EQ(k_pe.dim(), 2);
+  TORCH_CHECK_EQ(kv_c.dim(), 2);
+  TORCH_CHECK_GE(rope_cos_sin_cache.dim(), 2);
+  TORCH_CHECK_EQ(kv_cache.dim(), 3);
+
+  const int64_t num_tokens = q_pe.size(0);
+
+  const int num_q_heads = q_pe.size(1);
+  const int rot_dim = q_pe.size(2);
+  const int kv_lora_rank = kv_c.size(1);
+
+  // The kernel processes rot_dim / 2 pairs; an odd rot_dim would leave the
+  // last element unrotated and unwritten in the cache.
+  TORCH_CHECK(rot_dim % 2 == 0, "rot_dim must be even, got ", rot_dim);
+
+  TORCH_CHECK(positions.size(0) >=
+              num_tokens);  // CUDA Graphs might pad this for us
+  TORCH_CHECK_EQ(positions.scalar_type(), c10::ScalarType::Long);
+
+  TORCH_CHECK_EQ(k_pe.size(0), num_tokens);
+  TORCH_CHECK_EQ(k_pe.size(1), rot_dim);
+  TORCH_CHECK_EQ(k_pe.scalar_type(), q_pe.scalar_type());
+
+  TORCH_CHECK_EQ(kv_c.size(0), num_tokens);
+  TORCH_CHECK_EQ(kv_c.scalar_type(), q_pe.scalar_type());
+  TORCH_CHECK_EQ(kv_c.dtype(), q_pe.dtype());
+
+  TORCH_CHECK_EQ(rope_cos_sin_cache.size(1), rot_dim);
+  TORCH_CHECK_EQ(rope_cos_sin_cache.scalar_type(), q_pe.scalar_type());
+
+  TORCH_CHECK_EQ(kv_cache_slot_mapping.size(0), num_tokens);
+  TORCH_CHECK_EQ(kv_cache_slot_mapping.scalar_type(), c10::ScalarType::Long);
+
+  TORCH_CHECK_EQ(kv_cache.size(2), kv_lora_rank + rot_dim);
+
+  TORCH_CHECK_EQ(kv_cache_quant_scale.numel(), 1);
+  TORCH_CHECK_EQ(kv_cache_quant_scale.scalar_type(), c10::ScalarType::Float);
+
+  // Nothing to do for an empty batch; a zero-sized grid is not a valid launch
+  // configuration.
+  if (num_tokens == 0) {
+    return;
+  }
+
+  int64_t q_pe_stride_token = q_pe.stride(0);
+  int64_t q_pe_stride_head = q_pe.stride(1);
+
+  int64_t k_pe_stride = k_pe.stride(0);
+  int64_t kv_c_stride = kv_c.stride(0);
+
+  int block_size = kv_cache.size(1);
+
+  int block_stride = kv_cache.stride(0);
+  int entry_stride = kv_cache.stride(1);
+
+  // One thread per Q rotation pair or per kv_c element, whichever is larger,
+  // capped at 512 threads and never fewer than one.
+  int rope_block_size = num_q_heads * rot_dim / 2;
+  int mla_block_size = kv_lora_rank;
+  int thread_block_size =
+      std::max(std::min(std::max(rope_block_size, mla_block_size), 512), 1);
+
+  dim3 grid(num_tokens, 1, 1);
+  dim3 block(thread_block_size, 1, 1);
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(positions));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
+                             CALL_CONCAT_AND_CACHE_MLA_ROPE_FUSED);
+}
--- a/csrc/cpu/cpu_attn_macros.h
+++ b/csrc/cpu/cpu_attn_macros.h
-#ifndef CPU_ATTN_MACROS_H
-#define CPU_ATTN_MACROS_H
+#ifndef CPU_ARCH_MACROS_H
+#define CPU_ARCH_MACROS_H

 // x86_64
 #ifdef __x86_64__
@@ -26,7 +26,7 @@
          _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));                  \
      const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);                   \
      const int n_mantissa_bits = 23;                                          \
-      auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__((              \
+      auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__((        \
                          always_inline)) {                                    \
        __m512 values = vec.reg;                                               \
        auto less_ln_flt_min_mask =                                            \
@@ -98,7 +98,7 @@
      poly = vbslq_f32(hi_mask, inf, poly);                                    \
      return vbslq_f32(lo_mask, zero, poly);                                   \
    };                                                                         \
-    auto fast_exp = [&](vec_op::FP32Vec16& vec)                                \
+    auto fast_exp = [&](const vec_op::FP32Vec16& vec)                          \
                        __attribute__((always_inline)) {                       \
                          float32x4x4_t result;                                \
                          result.val[0] = neon_expf(vec.reg.val[0]);           \
@@ -110,4 +110,4 @@

 #endif  // __aarch64__

-#endif
\ No newline at end of file
+#endif
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -15,6 +15,7 @@

 #ifdef __aarch64__
  #include "cpu_attn_neon.hpp"
+  // NEON requires head_dim to be a multiple of 32
  #define NEON_DISPATCH(...)                                                   \
    case cpu_attention::ISA::NEON: {                                           \
      using attn_impl = cpu_attention::AttentionImpl<cpu_attention::ISA::NEON, \
@@ -36,7 +37,9 @@
    switch (HEAD_DIM) {                                         \
      CPU_ATTN_DISPATCH_CASE(32, __VA_ARGS__)                   \
      CPU_ATTN_DISPATCH_CASE(64, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(80, __VA_ARGS__)                   \
      CPU_ATTN_DISPATCH_CASE(96, __VA_ARGS__)                   \
+      CPU_ATTN_DISPATCH_CASE(112, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(128, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(160, __VA_ARGS__)                  \
      CPU_ATTN_DISPATCH_CASE(192, __VA_ARGS__)                  \

--- a/csrc/cpu/cpu_attn_amx.hpp
+++ b/csrc/cpu/cpu_attn_amx.hpp
@@ -377,7 +377,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
      const int32_t q_heads_per_kv, const int64_t q_num_stride,
      const int64_t q_head_stride, const float scale) {
    constexpr int64_t bytes_per_head = head_dim * sizeof(scalar_t);
-    static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
+    // static_assert(bytes_per_head % AMX_TILE_ROW_BYTES == 0);
    constexpr int64_t head_size_block_num = bytes_per_head / AMX_TILE_ROW_BYTES;
    constexpr int64_t head_elem_num_pre_block =
        AMX_TILE_ROW_BYTES / sizeof(scalar_t);

--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -8,10 +8,8 @@
  #include <sys/sysctl.h>
 #endif

-#include "cpu_types.hpp"
-#include "scratchpad_manager.h"
-#include "cpu_attn_macros.h"
-#include "utils.hpp"
+#include "cpu/cpu_arch_macros.h"
+#include "cpu/utils.hpp"

 namespace cpu_attention {
 enum class ISA { AMX, VEC, VEC16, NEON };
@@ -378,12 +376,13 @@ class AttentionScheduler {

  static constexpr int32_t MaxQTileIterNum = 128;

-  AttentionScheduler() : available_cache_size_(get_available_l2_size()) {}
+  AttentionScheduler()
+      : available_cache_size_(cpu_utils::get_available_l2_size()) {}

  torch::Tensor schedule(const ScheduleInput& input) const {
    const bool casual = input.casual;
    const int32_t thread_num = omp_get_max_threads();
-    const int64_t cache_size = get_available_l2_size();
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
    const int32_t max_num_q_per_iter = input.max_num_q_per_iter;
    const int32_t kv_len_alignment = input.kv_block_alignment;
    int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv;
@@ -659,7 +658,7 @@ class AttentionScheduler {
            metadata_ptr->thread_num +
        metadata_ptr->reduction_scratchpad_size_per_kv_head *
            (use_gqa ? input.num_heads_kv : input.num_heads_q);
-    DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
+    cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(
        scratchpad_size);

    // metadata_ptr->print();
@@ -667,7 +666,7 @@ class AttentionScheduler {
    // test out of boundary access
    // {
    //     float* cache_ptr =
-    //     DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<float>();
+    //     cpu_utils::ScratchPadManager::get_scratchpad_manager()->get_data<float>();
    //     for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) {
    //         cache_ptr[i] = std::numeric_limits<float>::quiet_NaN();
    //     }
@@ -749,27 +748,6 @@ class AttentionScheduler {
    return std::max(rounded_tile_size, round_size);
  }

-  static int64_t get_available_l2_size() {
-    static int64_t size = []() {
-#if defined(__APPLE__)
-      // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
-      int64_t l2_cache_size = 0;
-      size_t len = sizeof(l2_cache_size);
-      if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
-          l2_cache_size > 0) {
-        return l2_cache_size >> 1;  // use 50% of L2 cache
-      }
-      // Fallback if sysctlbyname fails
-      return 128LL * 1024 >> 1;  // use 50% of 128KB
-#else
-      long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
-      TORCH_CHECK_NE(l2_cache_size, -1);
-      return l2_cache_size >> 1;  // use 50% of L2 cache
-#endif
-    }();
-    return size;
-  }
-
 private:
  int64_t available_cache_size_;
 };
@@ -1402,7 +1380,7 @@ class AttentionMainLoop {

      // init buffers
      void* scratchpad_ptr =
-          DNNLScratchPadManager::get_dnnl_scratchpad_manager()
+          cpu_utils::ScratchPadManager::get_scratchpad_manager()
              ->get_data<void>();
      AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr);

@@ -1422,8 +1400,7 @@ class AttentionMainLoop {
        }
      }

-      const int64_t available_cache_size =
-          AttentionScheduler::get_available_l2_size();
+      const int64_t available_cache_size = cpu_utils::get_available_l2_size();
      const int32_t default_tile_size =
          AttentionScheduler::calcu_default_tile_size(
              available_cache_size, head_dim, sizeof(kv_cache_t),

--- a/csrc/cpu/cpu_attn_neon.hpp
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -264,7 +264,7 @@ class AttentionImpl<ISA::NEON, scalar_t, head_dim> {
  constexpr static ISA ISAType = ISA::NEON;
  constexpr static bool scale_on_logits = false;  // apply scale on q_buffer

-  static_assert(HeadDim % HeadDimAlignment == 0);
+  //  static_assert(HeadDim % HeadDimAlignment == 0);
  // the gemm micro kernel is Mx8
  static_assert(HeadDimAlignment % 8 == 0);
  static_assert(BlockSizeAlignment % 8 == 0);

--- a/csrc/cpu/cpu_fused_moe.cpp
+++ b/csrc/cpu/cpu_fused_moe.cpp
+#include "cpu/cpu_types.hpp"
+#include "cpu/utils.hpp"
+#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
+#include "cpu/cpu_arch_macros.h"
+
+// AMX_DISPATCH emits the switch case for cpu_utils::ISA::AMX. When
+// CPU_CAPABILITY_AMXBF16 is defined, the case binds gemm_t to the AMX
+// micro-GEMM and returns the result of invoking the given callable. Otherwise
+// it emits only the case label, so control falls through to whichever case
+// follows it at the expansion site.
+#ifdef CPU_CAPABILITY_AMXBF16
+  #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
+  #define AMX_DISPATCH(...)                                                    \
+    case cpu_utils::ISA::AMX: {                                                \
+      using gemm_t = cpu_micro_gemm::MicroGemm<cpu_utils::ISA::AMX, scalar_t>; \
+      return __VA_ARGS__();                                                    \
+    }
+#else
+  #define AMX_DISPATCH(...) case cpu_utils::ISA::AMX:
+#endif
+
+// Dispatches on ISA_TYPE inside an immediately-invoked lambda: the matching
+// case defines gemm_t for that ISA and returns the result of invoking the
+// callable passed as the trailing argument. The callable is written textually
+// inside the case, so it can refer to gemm_t; scalar_t must already be
+// visible at the expansion site. Any other ISA value fails a TORCH_CHECK.
+#define CPU_ISA_DISPATCH_IMPL(ISA_TYPE, ...)                          \
+  [&] {                                                               \
+    switch (ISA_TYPE) {                                               \
+      AMX_DISPATCH(__VA_ARGS__)                                       \
+      case cpu_utils::ISA::VEC: {                                     \
+        using gemm_t =                                                \
+            cpu_micro_gemm::MicroGemm<cpu_utils::ISA::VEC, scalar_t>; \
+        return __VA_ARGS__();                                         \
+      }                                                               \
+      default: {                                                      \
+        TORCH_CHECK(false, "Invalid CPU ISA type.");                  \
+      }                                                               \
+    }                                                                 \
+  }()
+
+namespace {
+// Gated activation kinds handled in this file; get_act_type() maps the
+// strings "silu" and "swigluoai" to these values.
+enum class FusedMOEAct { SiluAndMul, SwigluOAIAndMul };
+
+// Translates an activation name into its FusedMOEAct value. "silu" and
+// "swigluoai" are accepted; any other name fails a TORCH_CHECK.
+FusedMOEAct get_act_type(const std::string& act) {
+  if (act == "swigluoai") {
+    return FusedMOEAct::SwigluOAIAndMul;
+  }
+  if (act == "silu") {
+    return FusedMOEAct::SiluAndMul;
+  }
+  TORCH_CHECK(false, "Invalid act type: " + act);
+}
+
+// Applies the "swigluoai" gated activation row by row. Each input row holds
+// interleaved (gate, up) float pairs; every step consumes 32 input floats
+// (16 pairs) and stores 16 outputs:
+//   gate' = min(gate, 7), up' = clamp(up, -7, 7)
+//   out   = (1 + up') * gate' * sigmoid(1.702 * gate')
+// input and output advance by input_stride / output_stride elements per row.
+template <typename scalar_t>
+void swigluoai_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
+                       const int32_t m_size, const int32_t n_size,
+                       const int32_t input_stride,
+                       const int32_t output_stride) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  // For GPT-OSS interleaved gate-up weights
+  alignas(64) static int32_t index[16] = {0,  2,  4,  6,  8,  10, 12, 14,
+                                          16, 18, 20, 22, 24, 26, 28, 30};
+  vec_op::INT32Vec16 index_vec(index);
+  vec_op::FP32Vec16 gate_up_max_vec(7.0);
+  vec_op::FP32Vec16 up_min_vec(-7.0);
+  vec_op::FP32Vec16 alpha_vec(1.702);
+  vec_op::FP32Vec16 one_vec(1.0);
+
+  DEFINE_FAST_EXP
+
+  for (int32_t row = 0; row < m_size;
+       ++row, input += input_stride, output += output_stride) {
+    for (int32_t col = 0; col < n_size; col += 32) {
+      // Even offsets hold gate values, odd offsets hold up values.
+      vec_op::FP32Vec16 gate(input + col, index_vec);
+      vec_op::FP32Vec16 up(input + col + 1, index_vec);
+      gate = gate.min(gate_up_max_vec);
+      up = up.clamp(up_min_vec, gate_up_max_vec);
+
+      auto sigmoid = one_vec / (one_vec + fast_exp(-gate * alpha_vec));
+      auto gated = gate * sigmoid;
+      auto result_fp32 = (one_vec + up) * gated;
+
+      scalar_vec_t result = scalar_vec_t(result_fp32);
+      result.save(output + col / 2);
+    }
+  }
+}
+
+// Applies SiLU-and-multiply row by row. Within a row the first n_size / 2
+// floats are the gate values and the next n_size / 2 are the up values; 16
+// elements are processed per step:
+//   out = up * gate * sigmoid(gate)
+// input and output advance by input_stride / output_stride elements per row.
+template <typename scalar_t>
+void silu_and_mul(float* __restrict__ input, scalar_t* __restrict__ output,
+                  const int32_t m_size, const int32_t n_size,
+                  const int32_t input_stride, const int32_t output_stride) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  const int32_t dim = n_size / 2;
+  vec_op::FP32Vec16 one_vec(1.0);
+
+  DEFINE_FAST_EXP
+
+  for (int32_t row = 0; row < m_size;
+       ++row, input += input_stride, output += output_stride) {
+    for (int32_t col = 0; col < dim; col += 16) {
+      vec_op::FP32Vec16 gate_vals(input + col);
+      vec_op::FP32Vec16 up_vals(input + dim + col);
+
+      auto sigmoid = one_vec / (one_vec + fast_exp(-gate_vals));
+      auto activated = gate_vals * sigmoid;
+      auto result_fp32 = up_vals * activated;
+
+      scalar_vec_t result = scalar_vec_t(result_fp32);
+      result.save(output + col);
+    }
+  }
+}
+
+// Runs the gated activation selected by `act` over an m x n float input,
+// forwarding all arguments to silu_and_mul or swigluoai_and_mul. Any other
+// enum value fails a TORCH_CHECK.
+template <typename scalar_t>
+FORCE_INLINE void apply_gated_act(const FusedMOEAct act,
+                                  float* __restrict__ input,
+                                  scalar_t* __restrict__ output,
+                                  const int32_t m, const int32_t n,
+                                  const int32_t input_stride,
+                                  const int32_t output_stride) {
+  if (act == FusedMOEAct::SiluAndMul) {
+    silu_and_mul(input, output, m, n, input_stride, output_stride);
+  } else if (act == FusedMOEAct::SwigluOAIAndMul) {
+    swigluoai_and_mul(input, output, m, n, input_stride, output_stride);
+  } else {
+    TORCH_CHECK(false, "Unsupported act type.");
+  }
+}
+
+// Packs every expert's weight with gemm_t::pack_weight. Expert e is read
+// from weight_ptr + e * expert_stride and written to the same offset in
+// packed_weight_ptr; experts are distributed over OpenMP threads.
+template <typename scalar_t, typename gemm_t>
+void prepack_moe_weight_impl(scalar_t* __restrict__ weight_ptr,
+                             scalar_t* __restrict__ packed_weight_ptr,
+                             const int32_t expert_num,
+                             const int32_t output_size,
+                             const int32_t input_size,
+                             const int64_t expert_stride) {
+#pragma omp parallel for
+  for (int32_t expert = 0; expert < expert_num; ++expert) {
+    // 64-bit offset: expert_stride is int64_t, so the product is too.
+    const int64_t offset = expert_stride * expert;
+    scalar_t* src = weight_ptr + offset;
+    scalar_t* dst = packed_weight_ptr + offset;
+    gemm_t::pack_weight(src, dst, output_size, input_size);
+  }
+}
+
+// Fused mixture-of-experts forward pass on CPU.
+//
+// The routine runs four phases over a scratchpad obtained from
+// cpu_utils::ScratchPadManager:
+//   1. bucket the (token, top-k slot) pairs by expert id;
+//   2. per expert: gather the routed input rows, run the w13 GEMM (+ optional
+//      bias) and the gated activation;
+//   3. per expert: run the w2 GEMM (+ optional bias) on the activations;
+//   4. per token: combine its top-k expert outputs weighted by topk_weights
+//      and store the result into `output`.
+//
+// Pointer arguments are indexed as dense row-major arrays:
+//   output       [token_num, output_size_2]
+//   input        [token_num, input_size_13]
+//   w13          expert_num blocks of output_size_13 * input_size_13 elements
+//   w2           expert_num blocks of output_size_2 * input_size_2 elements
+//   w13_bias     [expert_num, output_size_13] or nullptr
+//   w2_bias      [expert_num, output_size_2] or nullptr
+//   topk_weights [token_num, topk_num]
+//   topk_id      [token_num, topk_num], every entry in [0, expert_num)
+//
+// Element offsets and scratchpad sizes are computed in 64 bits so that large
+// expert counts, weight shapes or batches do not wrap 32-bit arithmetic.
+// Raises (TORCH_CHECK) on unsupported sizes or out-of-range expert ids.
+template <typename scalar_t, typename w_t, typename gemm_t>
+void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input,
+                    w_t* __restrict__ w13, w_t* __restrict__ w2,
+                    w_t* __restrict__ w13_bias, w_t* __restrict__ w2_bias,
+                    float* __restrict__ topk_weights,
+                    int32_t* __restrict__ topk_id, FusedMOEAct act_type,
+                    const int32_t token_num, const int32_t expert_num,
+                    const int32_t topk_num, const int32_t input_size_13,
+                    const int32_t output_size_13, const int32_t input_size_2,
+                    const int32_t output_size_2) {
+  using scalar_vec_t = typename cpu_utils::VecTypeTrait<scalar_t>::vec_t;
+  constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
+  constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
+  constexpr int32_t min_w13_n_tile_size = 2 * gemm_n_tile_size;
+  static_assert(gemm_n_tile_size % 16 == 0);
+
+  TORCH_CHECK_EQ(output_size_13 % min_w13_n_tile_size, 0);
+  TORCH_CHECK_EQ(output_size_2 % gemm_n_tile_size, 0);
+  TORCH_CHECK_EQ(output_size_13 / 2, input_size_2);
+  // topk_num is used as a divisor when sizing the N tiles below.
+  TORCH_CHECK(topk_num > 0, "topk_num must be positive.");
+
+  // Number of (token, top-k slot) pairs. Their positions are stored as int32_t
+  // in the mapping buffers, so the count has to fit in 32 bits.
+  const int64_t expand_token_num = static_cast<int64_t>(token_num) * topk_num;
+  TORCH_CHECK(expand_token_num <= INT32_MAX,
+              "token_num * topk_num exceeds the int32 range.");
+
+  const int32_t thread_num = omp_get_max_threads();
+
+  const int64_t w13_input_buffer_size = cpu_utils::round_up<64>(
+      static_cast<int64_t>(gemm_m_tile_size) * input_size_13 *
+      sizeof(scalar_t));
+
+  const int32_t w13_n_tile_size = [&]() {
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
+    // input buffer + output buffer + weight
+    // Keep the budget in signed 64-bit math: dividing a negative budget by an
+    // unsigned sizeof() expression would wrap to a huge value.
+    const int64_t bytes_per_n = static_cast<int64_t>(
+        gemm_m_tile_size * sizeof(float) + input_size_13 * sizeof(scalar_t));
+    const int64_t cache_budget =
+        std::max<int64_t>(cache_size - w13_input_buffer_size, 0);
+    const int64_t n_size_cache_limit = cache_budget / bytes_per_n;
+    const int64_t n_size_thread_limit =
+        output_size_13 / std::max(1, thread_num / topk_num);
+    const int32_t n_size = cpu_utils::round_down<min_w13_n_tile_size>(
+        static_cast<int32_t>(
+            std::min(n_size_cache_limit, n_size_thread_limit)));
+    return std::max(n_size, min_w13_n_tile_size);
+  }();
+
+  const int64_t w2_input_tile_size = cpu_utils::round_up<64>(
+      static_cast<int64_t>(gemm_m_tile_size) * input_size_2 *
+      sizeof(scalar_t));
+
+  const int32_t w2_n_tile_size = [&]() {
+    const int64_t cache_size = cpu_utils::get_available_l2_size();
+    // input tile + weight
+    const int64_t bytes_per_n =
+        static_cast<int64_t>(input_size_2 * sizeof(scalar_t));
+    const int64_t cache_budget =
+        std::max<int64_t>(cache_size - w2_input_tile_size, 0);
+    const int64_t n_size_cache_limit = cache_budget / bytes_per_n;
+    const int64_t n_size_thread_limit =
+        output_size_2 / std::max(1, thread_num / topk_num);
+    const int32_t n_size = cpu_utils::round_down<gemm_n_tile_size>(
+        static_cast<int32_t>(
+            std::min(n_size_cache_limit, n_size_thread_limit)));
+    return std::max(n_size, gemm_n_tile_size);
+  }();
+
+  // allocate buffers
+  // All sizes/offsets are byte counts kept in 64 bits; the GEMM output
+  // buffers scale with token_num * topk_num * output_size.
+  int64_t common_buffer_offset = 0;
+  int64_t w13_thread_buffer_offset = 0;
+  int64_t ws_thread_buffer_offset = 0;
+
+  // common buffers
+  const int64_t token_num_per_group_buffer_size =
+      cpu_utils::round_up<64>(expert_num * sizeof(int32_t));
+  const int64_t token_num_per_group_buffer_offset = common_buffer_offset;
+  common_buffer_offset += token_num_per_group_buffer_size;
+
+  const int64_t cu_token_num_per_group_buffer_size =
+      cpu_utils::round_up<64>((expert_num + 1) * sizeof(int32_t));
+  const int64_t cu_token_num_per_group_buffer_offset = common_buffer_offset;
+  common_buffer_offset += cu_token_num_per_group_buffer_size;
+
+  const int64_t expand_token_id_buffer_size =
+      cpu_utils::round_up<64>(expand_token_num * sizeof(int32_t));
+  const int64_t expand_token_id_buffer_offset = common_buffer_offset;
+  common_buffer_offset += expand_token_id_buffer_size;
+
+  const int64_t expand_token_id_index_buffer_size =
+      cpu_utils::round_up<64>(expand_token_num * sizeof(int32_t));
+  const int64_t expand_token_id_index_buffer_offset = common_buffer_offset;
+  common_buffer_offset += expand_token_id_index_buffer_size;
+
+  const int64_t w13_gemm_output_buffer_size = cpu_utils::round_up<64>(
+      expand_token_num * (output_size_13 / 2) * sizeof(scalar_t));
+  const int64_t w13_gemm_output_buffer_offset = common_buffer_offset;
+  common_buffer_offset += w13_gemm_output_buffer_size;
+
+  const int64_t w2_gemm_output_buffer_size = cpu_utils::round_up<64>(
+      expand_token_num * output_size_2 * sizeof(float));
+  const int64_t w2_gemm_output_buffer_offset = common_buffer_offset;
+  common_buffer_offset += w2_gemm_output_buffer_size;
+
+  // w13 GEMM thread buffers
+  const int64_t w13_input_buffer_offset = w13_thread_buffer_offset;
+  w13_thread_buffer_offset += w13_input_buffer_size;
+
+  const int64_t w13_output_buffer_size = cpu_utils::round_up<64>(
+      static_cast<int64_t>(gemm_m_tile_size) * w13_n_tile_size *
+      sizeof(float));
+  const int64_t w13_output_buffer_offset = w13_thread_buffer_offset;
+  w13_thread_buffer_offset += w13_output_buffer_size;
+
+  // Weighted sum thread buffer
+  const int64_t ws_output_buffer_size =
+      cpu_utils::round_up<64>(output_size_2 * sizeof(float));
+  const int64_t ws_output_buffer_offset = ws_thread_buffer_offset;
+  ws_thread_buffer_offset += ws_output_buffer_size;
+
+  // The per-thread region is shared by the w13 phase and the weighted-sum
+  // phase (they never run concurrently), so it is sized for the larger one.
+  const int64_t buffer_size =
+      common_buffer_offset +
+      std::max(w13_thread_buffer_offset, ws_thread_buffer_offset) * thread_num;
+  cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc(buffer_size);
+  uint8_t* common_buffer_start =
+      cpu_utils::ScratchPadManager::get_scratchpad_manager()
+          ->get_data<uint8_t>();
+  uint8_t* thread_buffer_start = common_buffer_start + common_buffer_offset;
+
+  int32_t* __restrict__ token_num_per_group_buffer = reinterpret_cast<int32_t*>(
+      common_buffer_start + token_num_per_group_buffer_offset);
+  int32_t* __restrict__ cu_token_num_per_group_buffer =
+      reinterpret_cast<int32_t*>(common_buffer_start +
+                                 cu_token_num_per_group_buffer_offset);
+  int32_t* __restrict__ expand_token_id_buffer = reinterpret_cast<int32_t*>(
+      common_buffer_start + expand_token_id_buffer_offset);
+  int32_t* __restrict__ expand_token_id_index_buffer =
+      reinterpret_cast<int32_t*>(common_buffer_start +
+                                 expand_token_id_index_buffer_offset);
+
+  // prepare token-expert mappings
+  {
+    // Histogram: number of routed tokens per expert.
+    std::memset(token_num_per_group_buffer, 0, expert_num * sizeof(int32_t));
+    for (int32_t i = 0; i < expand_token_num; ++i) {
+      const int32_t curr_expert_id = topk_id[i];
+      // An out-of-range id would index outside the scratchpad buffers.
+      TORCH_CHECK(curr_expert_id >= 0 && curr_expert_id < expert_num,
+                  "topk_id contains an out-of-range expert id: ",
+                  curr_expert_id);
+      ++token_num_per_group_buffer[curr_expert_id];
+    }
+
+    // Exclusive prefix sum written one slot to the right: token_index_buffer[e]
+    // starts as the first row of expert e and is advanced while scattering, so
+    // that afterwards cu_token_num_per_group_buffer[e] is the first row of
+    // expert e.
+    int32_t token_num_sum = 0;
+    cu_token_num_per_group_buffer[0] = 0;
+    int32_t* token_index_buffer = cu_token_num_per_group_buffer + 1;
+    for (int32_t i = 0; i < expert_num; ++i) {
+      token_index_buffer[i] = token_num_sum;
+      token_num_sum += token_num_per_group_buffer[i];
+    }
+
+    // Scatter: expand_token_id_buffer[row] = source token of the expert-sorted
+    // row; expand_token_id_index_buffer[token, slot] = that row.
+    for (int32_t i = 0; i < token_num; ++i) {
+      int32_t* curr_topk_id = topk_id + static_cast<int64_t>(i) * topk_num;
+      int32_t* curr_index_buffer =
+          expand_token_id_index_buffer + static_cast<int64_t>(i) * topk_num;
+      for (int32_t j = 0; j < topk_num; ++j) {
+        int32_t curr_expert_id = curr_topk_id[j];
+        int32_t curr_index = token_index_buffer[curr_expert_id];
+        ++token_index_buffer[curr_expert_id];
+        expand_token_id_buffer[curr_index] = i;
+        curr_index_buffer[j] = curr_index;
+      }
+    }
+  }
+
+  // w13 GEMM + act
+  {
+    // Shared task counter: threads pull (expert, N-tile) tasks dynamically.
+    alignas(64) cpu_utils::Counter counter;
+    cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+      const int32_t task_num_per_expert =
+          (output_size_13 + w13_n_tile_size - 1) / w13_n_tile_size;
+      const int32_t task_num = task_num_per_expert * expert_num;
+
+      uint8_t* __restrict__ thread_buffer =
+          thread_buffer_start + thread_id * w13_thread_buffer_offset;
+      scalar_t* __restrict__ w13_input_buffer =
+          reinterpret_cast<scalar_t*>(thread_buffer + w13_input_buffer_offset);
+      float* __restrict__ w13_output_buffer =
+          reinterpret_cast<float*>(thread_buffer + w13_output_buffer_offset);
+      scalar_t* __restrict__ w13_gemm_output_buffer =
+          reinterpret_cast<scalar_t*>(common_buffer_start +
+                                      w13_gemm_output_buffer_offset);
+
+      gemm_t gemm;
+
+      const int32_t input_size_13_bytes = input_size_13 * sizeof(scalar_t);
+      const int32_t w13_n_group_stride = 16 * input_size_13;
+      const int32_t w13_n_tile_stride = gemm_n_tile_size * input_size_13;
+
+      for (;;) {
+        int32_t task_id = counter_ptr->acquire_counter();
+        if (task_id >= task_num) {
+          break;
+        }
+
+        const int32_t curr_expert_id = task_id / task_num_per_expert;
+        const int32_t curr_output_group_id = task_id % task_num_per_expert;
+        const int32_t curr_token_num =
+            token_num_per_group_buffer[curr_expert_id];
+        if (curr_token_num == 0) {
+          continue;
+        }
+
+        const int32_t actual_n_tile_size =
+            std::min(w13_n_tile_size,
+                     output_size_13 - curr_output_group_id * w13_n_tile_size);
+        // First expert-sorted row of this expert, widened for offset math.
+        const int64_t curr_expert_row_offset =
+            cu_token_num_per_group_buffer[curr_expert_id];
+        const int32_t* __restrict__ curr_expand_token_id_buffer =
+            expand_token_id_buffer + curr_expert_row_offset;
+        scalar_t* __restrict__ curr_w13_gemm_output_buffer =
+            w13_gemm_output_buffer +
+            curr_expert_row_offset * (output_size_13 / 2) +
+            curr_output_group_id * w13_n_tile_size / 2;
+
+        // 64-bit element offsets: expert_id * input_size * output_size can
+        // exceed the int32 range for large expert counts / weight shapes.
+        const int64_t w13_expert_offset =
+            static_cast<int64_t>(curr_expert_id) * input_size_13 *
+            output_size_13;
+        const int64_t w13_bias_expert_offset =
+            static_cast<int64_t>(curr_expert_id) * output_size_13;
+
+        w_t* __restrict__ w13_weight_ptr_0 = nullptr;
+        w_t* __restrict__ w13_weight_ptr_1 = nullptr;
+        w_t* __restrict__ w13_bias_ptr_0 = nullptr;
+        w_t* __restrict__ w13_bias_ptr_1 = nullptr;
+        if (act_type == FusedMOEAct::SwigluOAIAndMul) {
+          // For SwigluOAIAndMul, up and down weights are interleaved
+          w13_weight_ptr_0 =
+              w13 + w13_expert_offset +
+              static_cast<int64_t>(curr_output_group_id) * w13_n_tile_size *
+                  input_size_13;
+          w13_weight_ptr_1 =
+              w13_weight_ptr_0 +
+              static_cast<int64_t>(actual_n_tile_size / 2) * input_size_13;
+          if (w13_bias != nullptr) {
+            w13_bias_ptr_0 = w13_bias + w13_bias_expert_offset +
+                             curr_output_group_id * w13_n_tile_size;
+            w13_bias_ptr_1 = w13_bias_ptr_0 + actual_n_tile_size / 2;
+          }
+        } else {
+          // Otherwise the two halves of w13 are stored back to back, each
+          // covering output_size_13 / 2 rows.
+          w13_weight_ptr_0 =
+              w13 + w13_expert_offset +
+              static_cast<int64_t>(curr_output_group_id) *
+                  (w13_n_tile_size / 2) * input_size_13;
+          w13_weight_ptr_1 =
+              w13_weight_ptr_0 +
+              static_cast<int64_t>(output_size_13 / 2) * input_size_13;
+          if (w13_bias != nullptr) {
+            w13_bias_ptr_0 = w13_bias + w13_bias_expert_offset +
+                             curr_output_group_id * (w13_n_tile_size / 2);
+            w13_bias_ptr_1 = w13_bias_ptr_0 + output_size_13 / 2;
+          }
+        }
+
+        scalar_t* __restrict__ curr_w13_input_buffer = w13_input_buffer;
+        for (int32_t token_idx = 0; token_idx < curr_token_num;
+             token_idx += gemm_m_tile_size) {
+          const int32_t actual_token_num =
+              std::min(gemm_m_tile_size, curr_token_num - token_idx);
+          // copy inputs
+          {
+            // Gather the routed rows of `input` into the contiguous per-thread
+            // buffer, 64 bytes at a time plus a partial tail store.
+            // NOTE(review): the tail still loads a full 64-byte vector from
+            // the input row; presumably the input allocation tolerates the
+            // over-read - confirm.
+            scalar_t* __restrict__ curr_w13_input_buffer_iter =
+                curr_w13_input_buffer;
+            for (int32_t i = 0; i < actual_token_num; ++i) {
+              const int32_t curr_token_id = curr_expand_token_id_buffer[i];
+              int8_t* __restrict__ curr_input_iter = reinterpret_cast<int8_t*>(
+                  input + static_cast<int64_t>(curr_token_id) * input_size_13);
+              int8_t* __restrict__ curr_output_iter =
+                  reinterpret_cast<int8_t*>(curr_w13_input_buffer_iter);
+              int32_t j = 0;
+              for (; j < input_size_13_bytes - 64; j += 64) {
+                vec_op::INT8Vec64 vec(curr_input_iter);
+                vec.save(curr_output_iter);
+                curr_input_iter += 64;
+                curr_output_iter += 64;
+              }
+              vec_op::INT8Vec64 vec(curr_input_iter);
+              vec.save(curr_output_iter, input_size_13_bytes - j);
+
+              // update
+              curr_w13_input_buffer_iter += input_size_13;
+            }
+            // update
+            curr_expand_token_id_buffer += actual_token_num;
+          }
+
+          // gemm + act
+          {
+            w_t* __restrict__ w13_weight_ptr_0_iter = w13_weight_ptr_0;
+            w_t* __restrict__ w13_weight_ptr_1_iter = w13_weight_ptr_1;
+            w_t* __restrict__ w13_bias_ptr_0_iter = w13_bias_ptr_0;
+            w_t* __restrict__ w13_bias_ptr_1_iter = w13_bias_ptr_1;
+            scalar_t* __restrict__ curr_w13_input_buffer_iter =
+                curr_w13_input_buffer;
+            // The two halves land side by side in the fp32 tile buffer:
+            // columns [0, n/2) and [n/2, n).
+            float* __restrict__ w13_output_buffer_0_iter = w13_output_buffer;
+            float* __restrict__ w13_output_buffer_1_iter =
+                w13_output_buffer + actual_n_tile_size / 2;
+            for (int32_t i = 0; i < actual_n_tile_size;
+                 i += min_w13_n_tile_size) {
+              gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_0_iter,
+                        w13_output_buffer_0_iter, actual_token_num,
+                        input_size_13, input_size_13, w13_n_group_stride,
+                        actual_n_tile_size, false);
+
+              if (w13_bias != nullptr) {
+                cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
+                    w13_output_buffer_0_iter, w13_output_buffer_0_iter,
+                    w13_bias_ptr_0_iter, actual_token_num, actual_n_tile_size,
+                    actual_n_tile_size);
+                w13_bias_ptr_0_iter += gemm_n_tile_size;
+              }
+
+              gemm.gemm(curr_w13_input_buffer_iter, w13_weight_ptr_1_iter,
+                        w13_output_buffer_1_iter, actual_token_num,
+                        input_size_13, input_size_13, w13_n_group_stride,
+                        actual_n_tile_size, false);
+
+              if (w13_bias != nullptr) {
+                cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
+                    w13_output_buffer_1_iter, w13_output_buffer_1_iter,
+                    w13_bias_ptr_1_iter, actual_token_num, actual_n_tile_size,
+                    actual_n_tile_size);
+                w13_bias_ptr_1_iter += gemm_n_tile_size;
+              }
+
+              // update
+              w13_weight_ptr_0_iter += w13_n_tile_stride;
+              w13_weight_ptr_1_iter += w13_n_tile_stride;
+              w13_output_buffer_0_iter += gemm_n_tile_size;
+              w13_output_buffer_1_iter += gemm_n_tile_size;
+            }
+
+            apply_gated_act(act_type, w13_output_buffer,
+                            curr_w13_gemm_output_buffer, actual_token_num,
+                            actual_n_tile_size, actual_n_tile_size,
+                            output_size_13 / 2);
+
+            // update
+            curr_w13_gemm_output_buffer +=
+                gemm_m_tile_size * (output_size_13 / 2);
+          }
+        }
+      }
+    }
+  }
+
+  // w2 GEMM
+  {
+    alignas(64) cpu_utils::Counter counter;
+    cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+      const int32_t task_num_per_expert =
+          (output_size_2 + w2_n_tile_size - 1) / w2_n_tile_size;
+      const int32_t task_num = task_num_per_expert * expert_num;
+      scalar_t* __restrict__ w13_gemm_output_buffer =
+          reinterpret_cast<scalar_t*>(common_buffer_start +
+                                      w13_gemm_output_buffer_offset);
+      float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
+          common_buffer_start + w2_gemm_output_buffer_offset);
+
+      gemm_t gemm;
+
+      const int32_t w2_n_tile_stride = gemm_n_tile_size * input_size_2;
+      const int32_t w2_n_group_stride = 16 * input_size_2;
+
+      for (;;) {
+        int32_t task_id = counter_ptr->acquire_counter();
+        if (task_id >= task_num) {
+          break;
+        }
+
+        const int32_t curr_expert_id = task_id / task_num_per_expert;
+        const int32_t curr_output_group_id = task_id % task_num_per_expert;
+        const int32_t curr_token_num =
+            token_num_per_group_buffer[curr_expert_id];
+        if (curr_token_num == 0) {
+          continue;
+        }
+
+        const int32_t actual_n_tile_size =
+            std::min(w2_n_tile_size,
+                     output_size_2 - curr_output_group_id * w2_n_tile_size);
+        // First expert-sorted row of this expert, widened for offset math.
+        const int64_t curr_expert_row_offset =
+            cu_token_num_per_group_buffer[curr_expert_id];
+        scalar_t* __restrict__ curr_w13_gemm_output_buffer =
+            w13_gemm_output_buffer + curr_expert_row_offset * input_size_2;
+        float* __restrict__ curr_w2_gemm_output_buffer =
+            w2_gemm_output_buffer + curr_expert_row_offset * output_size_2 +
+            curr_output_group_id * w2_n_tile_size;
+        // 64-bit element offsets, see the w13 phase.
+        w_t* __restrict__ w2_weight_ptr =
+            w2 +
+            static_cast<int64_t>(curr_expert_id) * output_size_2 *
+                input_size_2 +
+            static_cast<int64_t>(curr_output_group_id) * w2_n_tile_size *
+                input_size_2;
+        w_t* __restrict__ w2_bias_ptr = nullptr;
+        if (w2_bias != nullptr) {
+          w2_bias_ptr = w2_bias +
+                        static_cast<int64_t>(curr_expert_id) * output_size_2 +
+                        curr_output_group_id * w2_n_tile_size;
+        }
+
+        for (int32_t token_idx = 0; token_idx < curr_token_num;
+             token_idx += gemm_m_tile_size) {
+          const int32_t actual_token_num =
+              std::min(gemm_m_tile_size, curr_token_num - token_idx);
+
+          w_t* __restrict__ w2_weight_ptr_iter = w2_weight_ptr;
+          w_t* __restrict__ w2_bias_ptr_iter = w2_bias_ptr;
+          float* __restrict__ curr_w2_gemm_output_buffer_iter =
+              curr_w2_gemm_output_buffer;
+          for (int32_t i = 0; i < actual_n_tile_size; i += gemm_n_tile_size) {
+            gemm.gemm(curr_w13_gemm_output_buffer, w2_weight_ptr_iter,
+                      curr_w2_gemm_output_buffer_iter, actual_token_num,
+                      input_size_2, input_size_2, w2_n_group_stride,
+                      output_size_2, false);
+
+            if (w2_bias != nullptr) {
+              cpu_micro_gemm::add_bias_epilogue<gemm_n_tile_size>(
+                  curr_w2_gemm_output_buffer_iter,
+                  curr_w2_gemm_output_buffer_iter, w2_bias_ptr_iter,
+                  actual_token_num, output_size_2, output_size_2);
+              w2_bias_ptr_iter += gemm_n_tile_size;
+            }
+
+            w2_weight_ptr_iter += w2_n_tile_stride;
+            curr_w2_gemm_output_buffer_iter += gemm_n_tile_size;
+          }
+
+          // update
+          curr_w13_gemm_output_buffer += gemm_m_tile_size * input_size_2;
+          curr_w2_gemm_output_buffer += gemm_m_tile_size * output_size_2;
+        }
+      }
+    }
+  }
+
+  // weighted sum
+  {
+    alignas(64) cpu_utils::Counter counter;
+    cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+    for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+      const int32_t task_num = token_num;
+      uint8_t* __restrict__ thread_buffer =
+          thread_buffer_start + thread_id * ws_thread_buffer_offset;
+      float* __restrict__ ws_output_buffer =
+          reinterpret_cast<float*>(thread_buffer + ws_output_buffer_offset);
+      float* __restrict__ w2_gemm_output_buffer = reinterpret_cast<float*>(
+          common_buffer_start + w2_gemm_output_buffer_offset);
+
+      for (;;) {
+        int32_t task_id = counter_ptr->acquire_counter();
+        if (task_id >= task_num) {
+          break;
+        }
+
+        // One task per token; widen once so row offsets are 64-bit.
+        const int64_t token_id = task_id;
+        int32_t* __restrict__ curr_expand_token_id_index_buffer =
+            expand_token_id_index_buffer + token_id * topk_num;
+        float* __restrict__ curr_weight = topk_weights + token_id * topk_num;
+        scalar_t* __restrict__ curr_output_buffer =
+            output + token_id * output_size_2;
+
+        if (topk_num > 1) {
+          // First expert: initialise the fp32 accumulator.
+          {
+            const int64_t w2_output_idx = curr_expand_token_id_index_buffer[0];
+            float* __restrict__ w2_output_iter =
+                w2_gemm_output_buffer + w2_output_idx * output_size_2;
+            float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
+            vec_op::FP32Vec16 weight_vec(curr_weight[0]);
+            for (int32_t i = 0; i < output_size_2; i += 16) {
+              vec_op::FP32Vec16 vec(w2_output_iter);
+              vec = vec * weight_vec;
+              vec.save(ws_output_buffer_iter);
+
+              // update
+              w2_output_iter += 16;
+              ws_output_buffer_iter += 16;
+            }
+          }
+
+          // Middle experts: accumulate in fp32.
+          {
+            for (int32_t idx = 1; idx < topk_num - 1; ++idx) {
+              const int64_t w2_output_idx =
+                  curr_expand_token_id_index_buffer[idx];
+              float* __restrict__ w2_output_iter =
+                  w2_gemm_output_buffer + w2_output_idx * output_size_2;
+              float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
+              vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
+              for (int32_t i = 0; i < output_size_2; i += 16) {
+                vec_op::FP32Vec16 vec(w2_output_iter);
+                vec_op::FP32Vec16 sum(ws_output_buffer_iter);
+                sum = sum + vec * weight_vec;
+                sum.save(ws_output_buffer_iter);
+
+                // update
+                w2_output_iter += 16;
+                ws_output_buffer_iter += 16;
+              }
+            }
+          }
+
+          // Last expert: add, convert to scalar_t and store into `output`.
+          {
+            int32_t idx = topk_num - 1;
+            const int64_t w2_output_idx =
+                curr_expand_token_id_index_buffer[idx];
+            float* __restrict__ w2_output_iter =
+                w2_gemm_output_buffer + w2_output_idx * output_size_2;
+            float* __restrict__ ws_output_buffer_iter = ws_output_buffer;
+            scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
+            vec_op::FP32Vec16 weight_vec(curr_weight[idx]);
+            for (int32_t i = 0; i < output_size_2; i += 16) {
+              vec_op::FP32Vec16 vec(w2_output_iter);
+              vec_op::FP32Vec16 sum(ws_output_buffer_iter);
+              sum = sum + vec * weight_vec;
+              scalar_vec_t out_vec(sum);
+              out_vec.save(curr_output_buffer_iter);
+
+              // update
+              w2_output_iter += 16;
+              ws_output_buffer_iter += 16;
+              curr_output_buffer_iter += 16;
+            }
+          }
+        } else {
+          // Single expert per token: scale and store without the accumulator.
+          const int64_t w2_output_idx = curr_expand_token_id_index_buffer[0];
+          float* __restrict__ w2_output_iter =
+              w2_gemm_output_buffer + w2_output_idx * output_size_2;
+          scalar_t* __restrict__ curr_output_buffer_iter = curr_output_buffer;
+          vec_op::FP32Vec16 weight_vec(curr_weight[0]);
+          for (int32_t i = 0; i < output_size_2; i += 16) {
+            vec_op::FP32Vec16 vec(w2_output_iter);
+            vec = vec * weight_vec;
+            scalar_vec_t out_vec(vec);
+            out_vec.save(curr_output_buffer_iter);
+
+            // update
+            w2_output_iter += 16;
+            curr_output_buffer_iter += 16;
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
+// Packs MoE expert weights into the layout expected by the selected ISA's
+// micro GEMM.
+//
+// `weight` must be a contiguous [expert_num, output_size, input_size] tensor
+// with output_size divisible by 32. `packed_weight` receives the result; it is
+// addressed with the same per-expert stride as `weight`, so it has to be
+// contiguous and hold the same number of elements. Raises (TORCH_CHECK) when
+// these preconditions are violated.
+void prepack_moe_weight(
+    const torch::Tensor& weight,  // [expert_num, output_size, input_size]
+    torch::Tensor& packed_weight, const std::string& isa) {
+  TORCH_CHECK(weight.dim() == 3,
+              "weight must be [expert_num, output_size, input_size].");
+  TORCH_CHECK(weight.is_contiguous());
+  // The kernel writes expert e at packed_weight + e * weight.stride(0); a
+  // smaller or strided destination would be written out of bounds.
+  TORCH_CHECK(packed_weight.is_contiguous(),
+              "packed_weight must be contiguous.");
+  TORCH_CHECK(packed_weight.numel() == weight.numel(),
+              "packed_weight must hold as many elements as weight.");
+  const int32_t expert_num = weight.size(0);
+  const int32_t output_size = weight.size(1);
+  const int32_t input_size = weight.size(2);
+  TORCH_CHECK_EQ(output_size % 32, 0);
+  const int64_t expert_stride = weight.stride(0);
+  cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
+
+  VLLM_DISPATCH_FLOATING_TYPES(
+      weight.scalar_type(), "prepack_moe_weight", [&]() {
+        CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
+          scalar_t* weight_ptr = weight.data_ptr<scalar_t>();
+          scalar_t* packed_weight_ptr = packed_weight.data_ptr<scalar_t>();
+          prepack_moe_weight_impl<scalar_t, gemm_t>(
+              weight_ptr, packed_weight_ptr, expert_num, output_size,
+              input_size, expert_stride);
+        });
+      });
+}
+
+// Entry point of the CPU fused MoE kernel: validates the tensors, then
+// dispatches fused_moe_impl on the dtype of `w13` and the requested ISA.
+//
+// The kernel addresses every tensor as a dense row-major array, so ranks,
+// shapes and contiguity are checked here; a mismatch would otherwise read or
+// write out of bounds. Raises (TORCH_CHECK) when a tensor does not match the
+// layout documented on its parameter.
+void cpu_fused_moe(
+    torch::Tensor& output,       // [token_num, output_size_2]
+    const torch::Tensor& input,  // [token_num, input_size_13]
+    const torch::Tensor&
+        w13,  // [expert_num, output_size_13, input_size_13], packed
+    const torch::Tensor&
+        w2,  // [expert_num, output_size_2, input_size_2], packed
+    const std::optional<torch::Tensor>&
+        w13_bias,  // [expert_num, output_size_13]
+    const std::optional<torch::Tensor>& w2_bias,  // [expert_num, output_size_2]
+    const torch::Tensor& topk_weights,            // [token_num, k], float32
+    const torch::Tensor& topk_id,                 // [token_num, k], int32
+    const std::string& act, const std::string& isa) {
+  TORCH_CHECK(input.dim() == 2, "input must be [token_num, input_size_13].");
+  TORCH_CHECK(w13.dim() == 3 && w2.dim() == 3,
+              "w13 and w2 must be 3-dimensional.");
+  TORCH_CHECK(topk_id.dim() == 2, "topk_id must be [token_num, k].");
+
+  const int32_t token_num = input.size(0);
+  const int32_t input_size_13 = input.size(1);
+  const int64_t input_stride = input.stride(0);
+  TORCH_CHECK_EQ(input_stride, input_size_13);
+  const int32_t expert_num = w13.size(0);
+  const int32_t output_size_13 = w13.size(1);
+  const int32_t input_size_2 = w2.size(2);
+  const int32_t output_size_2 = w2.size(1);
+  const int32_t topk_num = topk_id.size(1);
+
+  // Weights: one dense block per expert, same expert count for both GEMMs.
+  TORCH_CHECK(w13.is_contiguous() && w2.is_contiguous(),
+              "w13 and w2 must be contiguous.");
+  TORCH_CHECK(w13.size(2) == input_size_13,
+              "w13 input size does not match the input hidden size.");
+  TORCH_CHECK(w2.size(0) == expert_num,
+              "w13 and w2 must have the same number of experts.");
+
+  // Output: dense [token_num, output_size_2].
+  TORCH_CHECK(output.dim() == 2 && output.size(0) == token_num &&
+                  output.size(1) == output_size_2,
+              "output must be [token_num, output_size_2].");
+  TORCH_CHECK(output.is_contiguous(), "output must be contiguous.");
+
+  // Routing tensors: dense [token_num, k] with identical shapes.
+  TORCH_CHECK(topk_id.size(0) == token_num,
+              "topk_id must have one row per token.");
+  TORCH_CHECK(topk_weights.sizes() == topk_id.sizes(),
+              "topk_weights and topk_id must have the same shape.");
+  TORCH_CHECK(topk_weights.is_contiguous() && topk_id.is_contiguous(),
+              "topk_weights and topk_id must be contiguous.");
+
+  // Optional biases: dense [expert_num, output_size].
+  if (w13_bias.has_value()) {
+    TORCH_CHECK(w13_bias->is_contiguous() &&
+                    w13_bias->numel() ==
+                        static_cast<int64_t>(expert_num) * output_size_13,
+                "w13_bias must be a contiguous [expert_num, output_size_13].");
+  }
+  if (w2_bias.has_value()) {
+    TORCH_CHECK(w2_bias->is_contiguous() &&
+                    w2_bias->numel() ==
+                        static_cast<int64_t>(expert_num) * output_size_2,
+                "w2_bias must be a contiguous [expert_num, output_size_2].");
+  }
+
+  const FusedMOEAct act_type = get_act_type(act);
+  cpu_utils::ISA isa_type = cpu_utils::get_isa(isa);
+
+  VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() {
+    CPU_ISA_DISPATCH_IMPL(isa_type, [&]() {
+      fused_moe_impl<scalar_t, scalar_t, gemm_t>(
+          output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+          w13.data_ptr<scalar_t>(), w2.data_ptr<scalar_t>(),
+          w13_bias.has_value() ? w13_bias->data_ptr<scalar_t>() : nullptr,
+          w2_bias.has_value() ? w2_bias->data_ptr<scalar_t>() : nullptr,
+          topk_weights.data_ptr<float>(), topk_id.data_ptr<int32_t>(), act_type,
+          token_num, expert_num, topk_num, input_size_13, output_size_13,
+          input_size_2, output_size_2);
+    });
+  });
+}
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -352,6 +352,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
  explicit FP32Vec16(bool, void* ptr)
      : reg((__m512)_mm512_stream_load_si512(ptr)) {}

+  // indexed (gather) load: each lane is read from ptr at the element index
+  // held in the matching lane of idx; the scale of 4 is the byte size of a
+  // float. A strided load is the special case idx = {0, s, 2s, ...}.
+  explicit FP32Vec16(const float* ptr, INT32Vec16 idx)
+      : reg(_mm512_i32gather_ps(idx.reg, ptr, 4)) {}
+
  explicit FP32Vec16(__m512 data) : reg(data) {}

  // de-pack 4 bit values
@@ -408,6 +412,10 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

+  // unary minus: XOR with -0.0f (only the sign bit set) flips the sign of
+  // every lane
+  FP32Vec16 operator-() const {
+    return FP32Vec16(_mm512_xor_ps(reg, _mm512_set1_ps(-0.0f)));
+  }
+
  FP32Vec16 operator/(const FP32Vec16& b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }