// Adapted from FasterTransformer, https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* 230 231 238 239 246 247 254 255 <-- load by lane 31, broadcast to lane {3, 7, 11, ..., 31} (8x)
*
 * {k}-th wscale used by lane {i} => {k // (WSCALES_PACK_SIZE * WARP_SIZE)}-th pack, in lane {4*(k //
 * WSCALES_PACK_SIZE) + i % 4}, element {k % WSCALES_PACK_SIZE}
*
* max pack size set to 8 since max load size is 16 bytes / lane
* min pack size set to 2 since shuffle granularity is 32b 2*half
...
...
 *
* 54 62
* 55 63 <-- load by lane 31, broadcast to lane {28, 29, 30, 31} (4x)
*
 * {k}-th ascale used by lane {i} => {k // (ASCALES_PACK_SIZE * WARP_SIZE)}-th pack, in lane {8*(k //
 * ASCALES_PACK_SIZE) + i // 4}, element {k % ASCALES_PACK_SIZE}