"...composable_kernel_rocm.git" did not exist on "9f1b42767c0b22c8b5ffe646ac647e68100f791e"
Commit 67ab3896 authored by Aleksander Dudek's avatar Aleksander Dudek
Browse files

Merge branch 'develop' into gemm_getname

parents 8adaf418 d5c8a334
@@ -77,6 +77,43 @@ struct DeviceGemmV2R1 : public BaseOperator
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
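// Device GEMM interface with blockwise dequantization of B: each ScaleBlockN x ScaleBlockK
// block of B is rescaled by an entry of p_b_scale (of type BScaleType, stride StrideScaleB).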
template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType,
typename BScaleType,
typename CDataType,
index_t ScaleBlockN,
index_t ScaleBlockK,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceGemmV2BScale : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
ck::index_t StrideScaleB,
const void* p_b_scale,
ck::index_t KSplit,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
virtual bool GetPermuteB() = 0;
virtual ck::index_t GetKPerBlock() = 0;
};
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -469,7 +469,11 @@ struct DeviceGemm_Xdl_CShuffle_Streamk_V3 : public DeviceGemm_Streamk_V2<ALayout
{
return false;
}
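// Stream-K (Streamk_sel > 0) combines partial tiles through atomic adds on C,
// so a bf16 C is rejected on targets without bf16 atomic support.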
if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> &&
arg.Streamk_sel > 0)
{
return false;
}
if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
GemmSpec == GemmSpecialization::NKPadding ||
GemmSpec == GemmSpecialization::MNKPadding ||
...
@@ -44,6 +44,40 @@ __host__ __device__ inline half4_t pki4_to_half4(int q)
return res.template AsType<half4_t>()[Number<0>{}];
}
__host__ __device__ inline half4_t pki4_to_half4_scale(int q, const ck::half2_t& scale)
{
const int LO = 0x000f000f;
const int HI = 0x00f000f0;
const int EX = 0x64006400;
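// 0x6400 is the fp16 bit pattern of 1024.0; OR-ing a 4-bit field into its low mantissa
// bits yields 1024 + n for the low nibble and 1024 + 16 * n for the high nibble.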
// Extract the two int4 values at the low bits and create two fp16 numbers.
int lo = amd_assembly_and_or_b32(q, LO, EX);
// Extract the two int4 values at the high bits and create two fp16 numbers.
int hi = amd_assembly_and_or_b32(q, HI, EX);
const int SUB = 0xE408E408; // half2 {-1032, -1032}
const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
const int ADD = 0xd480d480; // half2 {-72, -72}
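// Both halves map the unsigned nibble n in [0, 15] to n - 8:
//   low : (1024 + n) + (-1032)             = n - 8
//   high: (1024 + 16 * n) * (1/16) + (-72) = n - 8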
vector_type<half_t, 4> res;
res.template AsType<half2_t>()(Number<0>{}) =
amd_assembly_pk_add_f16(bit_cast<half2_t>(lo), bit_cast<half2_t>(SUB));
res.template AsType<half2_t>()(Number<1>{}) = amd_assembly_pk_fma_f16(
bit_cast<half2_t>(hi), bit_cast<half2_t>(MUL), bit_cast<half2_t>(ADD));
asm volatile("v_pk_mul_f16 %0, %1, %2"
: "=v"(res.template AsType<half2_t>()(Number<0>{}))
: "v"(res.template AsType<half2_t>()(Number<0>{})), "v"(scale));
asm volatile("v_pk_mul_f16 %0, %1, %2"
: "=v"(res.template AsType<half2_t>()(Number<1>{}))
: "v"(res.template AsType<half2_t>()(Number<1>{})), "v"(scale));
return res.template AsType<half4_t>()[Number<0>{}];
}
__host__ __device__ inline half2_t pki4_to_half2(pk_i4_t q)
{
#if 1
@@ -171,7 +205,42 @@ struct PassThroughPack8
dst.template AsType<bhalf2_t>()(Number<3>{}) =
pki4_to_bhalf2(src.template AsType<pk_i4_t>()[Number<3>{}]);
y = dst.template AsType<bhalf8_t>()[Number<0>{}];
#endif
}
constexpr const static bool is_pack8_invocable = true;
};
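// Elementwise functor that converts 8 packed int4 values to fp16 and multiplies them by a
// half2 scale in the same step (fast path via pki4_to_half4_scale above).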
struct DequantPack8
{
template <typename Y, typename X, typename Z>
__host__ __device__ void operator()(Y& y, const X& x, const Z& z) const;
__host__ __device__ constexpr void
operator()(ck::half8_t& y, const ck::pk_i4x4_t& x, const ck::half2_t& z) const
{
#if 1
vector_type<half_t, 8> result;
result.template AsType<half4_t>()(Number<0>{}) = pki4_to_half4_scale(bit_cast<int>(x), z);
result.template AsType<half4_t>()(Number<1>{}) =
pki4_to_half4_scale(bit_cast<int>(x) >> 8, z);
y = result.template AsType<half8_t>()[Number<0>{}];
#else
vector_type<half_t, 8> dst;
vector_type<pk_i4_t, 4> src{x};
dst.template AsType<half2_t>()(Number<0>{}) =
pki4_to_half2(src.template AsType<pk_i4_t>()[Number<0>{}]);
dst.template AsType<half2_t>()(Number<1>{}) =
pki4_to_half2(src.template AsType<pk_i4_t>()[Number<1>{}]);
dst.template AsType<half2_t>()(Number<2>{}) =
pki4_to_half2(src.template AsType<pk_i4_t>()[Number<2>{}]);
dst.template AsType<half2_t>()(Number<3>{}) =
pki4_to_half2(src.template AsType<pk_i4_t>()[Number<3>{}]);
y = dst.template AsType<half8_t>()[Number<0>{}];
#endif
}
...
@@ -1222,6 +1222,206 @@ struct ThreadwiseTensorSliceTransfer_v4
});
}
// Run() variant that fuses the dequantization scale into the transfer (used for packed-int4 sources)
template <typename SrcRefToOriginDisplacement,
typename DstOriginIdx,
typename SrcBuffer,
typename DstBuffer>
__device__ void Run(const SrcDesc&,
const SrcRefToOriginDisplacement&,
const SrcBuffer& src_buf,
const DstData& scale,
const DstDesc&,
const DstOriginIdx&,
DstBuffer& dst_buf) const
{
static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
"wrong! SrcDesc and DstDesc need to known at compile-time");
static_assert(
is_same<remove_cvref_t<typename SrcBuffer::type>, remove_cvref_t<SrcData>>::value &&
is_same<remove_cvref_t<typename DstBuffer::type>, remove_cvref_t<DstData>>::value,
"wrong! SrcBuffer or DstBuffer data type is wrong");
static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer");
static_assert(is_known_at_compile_time<remove_cvref_t<SrcRefToOriginDisplacement>>::value &&
is_known_at_compile_time<remove_cvref_t<DstOriginIdx>>::value,
"wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known "
"at compile-time");
// SrcDesc and DstDesc are known at compile-time
constexpr auto src_desc = remove_cvref_t<SrcDesc>{};
constexpr auto dst_desc = remove_cvref_t<DstDesc>{};
// SrcRefToOriginDisplacement and DstOriginIdx are known at compile-time
constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{});
constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{});
// scalar per access of each dim
constexpr auto src_scalar_per_access = generate_sequence_v2(
[&](auto i) constexpr {
if constexpr(i == SrcVectorDim)
{
return Number<SrcScalarPerVector>{};
}
else
{
return Number<1>{};
}
},
Number<nDim>{});
// scalar step (if stepping on SrcVectorDim) of each dim
constexpr auto src_scalar_step_in_vector = generate_sequence_v2(
[&](auto i) constexpr {
if constexpr(i == SrcVectorDim)
{
return Number<1>{};
}
else
{
return Number<0>{};
}
},
Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
#if 0
// TODO: unable to compile
// position in slice window
constexpr auto data_to_origin_disp_idx =
container_reorder_given_old2new(ordered_access_idx, dim_access_order) *
src_scalar_per_access;
#else
// position in slice window
constexpr auto data_to_origin_disp_idx =
ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access;
#endif
// src coordinate
constexpr auto src_ref_to_data_disp_idx =
src_ref_to_origin_disp_idx + data_to_origin_disp_idx;
constexpr auto src_ref_to_data_disp_coord_step =
make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx);
auto src_data_coord = src_ref_coord_;
move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step);
vector_type_maker_t<SrcData, SrcScalarPerVector / PackedSize> src_tmp_vector;
using src_vector_t = typename decltype(src_tmp_vector)::type;
const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_data_coord);
// copy data from src_buf into src_tmp_vector
if constexpr(SrcBuffer::IsDynamicBuffer())
{
src_tmp_vector.template AsType<src_vector_t>()(Number<0>{}) =
src_buf.template Get<src_vector_t>(src_data_coord.GetOffset() / PackedSize,
is_src_valid);
}
else if constexpr(SrcBuffer::IsStaticBuffer())
{
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t src_offset = src_desc.CalculateOffset(
src_ref_to_origin_disp_idx + data_to_origin_disp_idx +
i * src_scalar_step_in_vector);
src_tmp_vector.template AsType<SrcData>()(i) = src_buf[Number<src_offset>{}];
});
}
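// Three conversion paths: packed-int4 sources are dequantized with the fused scale,
// f8 -> fp16 uses the packed PassThroughPack2 fast path, and everything else falls back
// to a scalar type_convert.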
if constexpr(is_same<remove_cvref_t<SrcData>, pk_i4_t>::value)
{
// copy data from src_tmp_vector to dst_tmp_vector (cast data from SrcData to
// DstData)
vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
vector_type<DstData, 2> scale_vector;
scale_vector.template AsType<DstData>()(Number<0>{}) = scale;
scale_vector.template AsType<DstData>()(Number<1>{}) = scale;
constexpr index_t pack_size = 8;
static_assert(SrcScalarPerVector % pack_size == 0, "");
using src_v_t = typename vector_type_maker_t<SrcData, pack_size / PackedSize>::type;
using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
using scale_v_t = typename vector_type_maker_t<DstData, 2>::type;
static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
ck::tensor_operation::element_wise::DequantPack8{}(
dst_tmp_vector.template AsType<dst_v_t>()(i),
src_tmp_vector.template AsType<src_v_t>()[i],
scale_vector.template AsType<scale_v_t>()[Number<0>{}]);
});
// copy data from dst_tmp_vector into dst_buf
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t dst_offset = dst_desc.CalculateOffset(
dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
});
}
else if constexpr(is_same<remove_cvref_t<SrcData>, f8_t>::value &&
is_same<remove_cvref_t<DstData>, half_t>::value &&
SrcScalarPerVector % 2 == 0)
{
// copy data from src_tmp_vector to dst_tmp_vector (cast data from SrcData to
// DstData)
vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
constexpr index_t pack_size = 2;
using dst_v_t = typename vector_type_maker_t<DstData, pack_size>::type;
using src_v_t = typename vector_type_maker_t<SrcData, pack_size>::type;
static_for<0, SrcScalarPerVector / pack_size, 1>{}([&](auto i) {
ck::tensor_operation::element_wise::PassThroughPack2{}(
dst_tmp_vector.template AsType<dst_v_t>()(i),
src_tmp_vector.template AsType<src_v_t>()[i]);
});
// copy data from dst_tmp_vector into dst_buf
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t dst_offset = dst_desc.CalculateOffset(
dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
});
}
else
{
// copy data from src_tmp_vector to dst_tmp_vector (cast data from SrcData to
// DstData)
vector_type_maker_t<DstData, SrcScalarPerVector> dst_tmp_vector;
// TODO: if SrcData and DstData are vector types, then static_cast may not compile
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
dst_tmp_vector.template AsType<DstData>()(i) =
type_convert<DstData>(src_tmp_vector.template AsType<SrcData>()[i]);
});
// copy data from dst_tmp_vector into dst_buf
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t dst_offset = dst_desc.CalculateOffset(
dst_origin_idx + data_to_origin_disp_idx + i * src_scalar_step_in_vector);
dst_buf(Number<dst_offset>{}) = dst_tmp_vector.template AsType<DstData>()[i];
});
}
});
}
template <typename SrcSliceMoveStepIdx>
__device__ void MoveSrcSliceWindow(const SrcDesc&,
const SrcSliceMoveStepIdx& src_slice_move_step_idx)
@@ -1344,7 +1544,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic
ElementwiseOperation element_op_;
};
// Specialized for gfx11
// A single Wave32 is composed of two rows
// Data exchange is allowed between these two rows
// This RowLane Dst buf will be filled from two Src bufs
@@ -1479,7 +1679,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
ElementwiseOperation element_op_{};
};
// Specialized for gfx12
template <typename SrcData,
typename DstData,
typename SrcDesc,
...
@@ -307,7 +307,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
// Wave mode dependent property
static constexpr index_t wave_size = Number<WaveSize>{};
// * Fixed for gfx11, will be wave mode dependent on gfx12
// static constexpr index_t num_src_a_vgprs_per_wave = k_per_wmma / 2 * src_a_data_size / 4;
// static constexpr index_t num_src_b_vgprs_per_wave = k_per_wmma / 2 * src_b_data_size / 4;
// * num_acc_vgprs_per_wave along M direction
...
@@ -4,8 +4,8 @@
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP
#include "c_style_pointer_cast.hpp"
#include "data_type.hpp"
// TODO: deprecate all amd_assembly_outer_product_xxx
@@ -21,14 +21,14 @@ inline __device__ int amd_assembly_and_or_b32(int a, int b, int d)
inline __device__ half2_t amd_assembly_pk_fma_f16(half2_t a, half2_t b, half2_t c)
{
half2_t d;
asm volatile("v_pk_fma_f16 %0, %1, %2, %3" : "=v"(d) : "v"(a), "v"(b), "v"(c));
return d;
}
inline __device__ half2_t amd_assembly_pk_add_f16(half2_t a, half2_t b)
{
half2_t c;
asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(a), "v"(b));
return c;
}
...
@@ -19,6 +19,8 @@ struct pk_i4_t
type data;
__host__ __device__ constexpr pk_i4_t() : data{type{}} {}
__host__ __device__ constexpr pk_i4_t(type init) : data{init} {}
__host__ __device__ constexpr operator float() const { return static_cast<int8_t>(data); }
};
inline constexpr auto next_pow2(uint32_t x)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -465,6 +465,19 @@ inline __host__ __device__ float2_t type_convert<float2_t, f8x2_ocp_t>(f8x2_ocp_
#endif
}
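// Unpacks the two 4-bit lanes of a pk_i4_t into a float2; note that the nibbles are
// converted as unsigned values here (no sign extension).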
template <>
inline __host__ __device__ float2_t type_convert<float2_t, pk_i4_t>(pk_i4_t x)
{
uint8_t x_u8 = ck::bit_cast<uint8_t>(x);
uint8_t x_l = (x_u8 & 0x0f) >> 0;
uint8_t x_h = (x_u8 & 0xf0) >> 4;
auto l_f32 = ck::type_convert<float>(x_l);
auto h_f32 = ck::type_convert<float>(x_h);
return {l_f32, h_f32};
}
template <>
inline __host__ __device__ half2_t type_convert<half2_t, float2_t>(float2_t x)
{
...
@@ -47,10 +47,16 @@ struct FmhaFwdSplitKVKernel
static constexpr bool kStoreLSE = FmhaPipeline::kStoreLSE;
static constexpr bool kDoFp8StaticQuant = FmhaPipeline::Problem::kDoFp8StaticQuant;
static constexpr bool kIsPagedKV = FmhaPipeline::Problem::kIsPagedKV;
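// When enabled, the nhead_q / nhead_kv query-head groups that share one KV head are folded
// into the seqlen_q dimension (see GridSize and the Q/Oacc view reshapes below); this is
// only valid without bias and without masking, as asserted below.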
static constexpr bool kMergeNumHeadGroupsSeqLenQ =
FmhaPipeline::Problem::kMergeNumHeadGroupsSeqLenQ;
using FmhaMask = ck_tile::remove_cvref_t<typename FmhaPipeline::FmhaMask>;
static constexpr bool kHasMask = FmhaMask::IsMasking;
static_assert(!kMergeNumHeadGroupsSeqLenQ ||
(kMergeNumHeadGroupsSeqLenQ && BiasEnum == BlockAttentionBiasEnum::NO_BIAS &&
!kHasMask),
"kMergeNumHeadGroupsSeqLenQ requires NO_BIAS and no masking");
// clang-format off
template <typename T> struct t2s;
template <> struct t2s<float> { static constexpr const char * name = "fp32"; };
@@ -476,15 +482,20 @@ struct FmhaFwdSplitKVKernel
}
CK_TILE_HOST static constexpr auto GridSize(ck_tile::index_t batch_size,
ck_tile::index_t nhead_q,
ck_tile::index_t nhead_kv,
ck_tile::index_t max_seqlen_q,
ck_tile::index_t hdim_v,
ck_tile::index_t num_splits)
{
ck_tile::index_t nhead_ = kMergeNumHeadGroupsSeqLenQ ? nhead_kv : nhead_q;
ck_tile::index_t max_seqlen_q_ =
max_seqlen_q * (kMergeNumHeadGroupsSeqLenQ ? nhead_q / nhead_kv : 1);
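// when merging, launch one grid-y slot per KV head and absorb the query-head groups
// into the seqlen_q tiling instead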
// TODO: this may need tuning
return dim3(ck_tile::integer_divide_ceil(max_seqlen_q_, FmhaPipeline::kM0) *
ck_tile::integer_divide_ceil(hdim_v, FmhaPipeline::kN1) * num_splits,
nhead_,
batch_size);
}
@@ -562,7 +573,7 @@ struct FmhaFwdSplitKVKernel
// # of required blocks is different in each group, terminate unnecessary blocks
// earlier
if(kargs.seqlen_q * (kMergeNumHeadGroupsSeqLenQ ? kargs.nhead_ratio_qk : 1) <= i_m0)
{
return;
}
@@ -617,30 +628,60 @@ struct FmhaFwdSplitKVKernel
}
// for simplicity, we just offset the pointer by the batch stride
const index_t i_nhead_k =
(kMergeNumHeadGroupsSeqLenQ ? i_nhead : i_nhead / kargs.nhead_ratio_qk);
const QDataType* q_ptr = reinterpret_cast<const QDataType*>(kargs.q_ptr) +
static_cast<long_index_t>(i_nhead) *
(kMergeNumHeadGroupsSeqLenQ ? kargs.nhead_ratio_qk : 1) *
kargs.nhead_stride_q +
batch_offset_q;
const KDataType* k_ptr = reinterpret_cast<const KDataType*>(kargs.k_ptr) +
static_cast<long_index_t>(i_nhead_k) * kargs.nhead_stride_k +
batch_offset_k;
const VDataType* v_ptr = reinterpret_cast<const VDataType*>(kargs.v_ptr) +
static_cast<long_index_t>(i_nhead_k) * kargs.nhead_stride_v +
batch_offset_v;
ODataType* o_acc_ptr = reinterpret_cast<ODataType*>(kargs.o_acc_ptr) +
static_cast<long_index_t>(i_nhead) *
(kMergeNumHeadGroupsSeqLenQ ? kargs.nhead_ratio_qk : 1) *
kargs.nhead_stride_o_acc +
batch_offset_o_acc + i_split * kargs.split_stride_o_acc;
// Q/K/V DRAM and DRAM window
const auto q_dram = [&] {
const auto q_dram_naive = [&] {
if constexpr(kMergeNumHeadGroupsSeqLenQ)
{
// reshape: (nhead_ratio_qk, seqlen_q, hdim_q) -> (nhead_ratio_qk * seqlen_q,
// hdim_q)
const auto view = make_naive_tensor_view<address_space_enum::global>(
q_ptr,
make_tuple(kargs.nhead_ratio_qk, kargs.seqlen_q, kargs.hdim_q),
make_tuple(kargs.nhead_stride_q, kargs.stride_q, 1),
number<FmhaPipeline::kAlignmentQ>{},
number<1>{});
return transform_tensor_view(
view,
make_tuple(
make_merge_transform(make_tuple(kargs.nhead_ratio_qk, kargs.seqlen_q)),
make_pass_through_transform(kargs.hdim_q)),
make_tuple(sequence<0, 1>{}, sequence<2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
}
else
{
return make_naive_tensor_view<address_space_enum::global>(
q_ptr,
make_tuple(kargs.seqlen_q, kargs.hdim_q),
make_tuple(kargs.stride_q, 1),
number<FmhaPipeline::kAlignmentQ>{},
number<1>{});
}
}();
if constexpr(FmhaPipeline::kQLoadOnce)
{
return pad_tensor_view(
@@ -729,7 +770,7 @@ struct FmhaFwdSplitKVKernel
}
}();
auto k_page_block_navigator = [&, i_batch_ = i_batch]() {
if constexpr(kIsPagedKV)
{
const auto* block_indices =
@@ -739,8 +780,7 @@ struct FmhaFwdSplitKVKernel
integer_divide_ceil(kv_l2p_offset + kargs.seqlen_k, kargs.page_block_size);
const long_index_t fixed_offset =
static_cast<long_index_t>(i_nhead_k) * kargs.nhead_stride_k;
return make_page_block_navigator<const KDataType, 0>(
kargs.k_ptr,
@@ -760,7 +800,7 @@ struct FmhaFwdSplitKVKernel
}
}();
auto v_page_block_navigator = [&, i_batch_ = i_batch]() {
if constexpr(kIsPagedKV)
{
const auto* block_indices =
@@ -770,8 +810,7 @@ struct FmhaFwdSplitKVKernel
integer_divide_ceil(kv_l2p_offset + kargs.seqlen_k, kargs.page_block_size);
const long_index_t fixed_offset =
static_cast<long_index_t>(i_nhead_k) * kargs.nhead_stride_v;
return make_page_block_navigator<const VDataType, 1>(
kargs.v_ptr,
@@ -842,19 +881,40 @@ struct FmhaFwdSplitKVKernel
// lse acc
auto lse_acc_dram_window = [&, i_nhead_ = i_nhead, i_split_ = i_split]() {
constexpr auto lse_acc_dram_window_lengths = make_tuple(number<FmhaPipeline::kM0>{});
LSEDataType* lse_acc_ptr = reinterpret_cast<LSEDataType*>(kargs.lse_acc_ptr) +
static_cast<long_index_t>(i_nhead_) *
(kMergeNumHeadGroupsSeqLenQ ? kargs.nhead_ratio_qk : 1) *
kargs.nhead_stride_lse_acc +
batch_offset_lse_acc + i_split_ * kargs.split_stride_lse_acc;
const auto lse_acc_dram = [&] {
const auto lse_acc_dram_naive = [&] {
if constexpr(kMergeNumHeadGroupsSeqLenQ)
{
// reshape: (nhead_ratio_qk, seqlen_q) -> (nhead_ratio_qk * seqlen_q)
const auto view = make_naive_tensor_view<address_space_enum::global>(
lse_acc_ptr,
make_tuple(kargs.nhead_ratio_qk, kargs.seqlen_q),
make_tuple(kargs.nhead_stride_lse_acc, 1),
number<1>{},
number<1>{});
return transform_tensor_view(view,
make_tuple(make_merge_transform(make_tuple(
kargs.nhead_ratio_qk, kargs.seqlen_q))),
make_tuple(sequence<0, 1>{}),
make_tuple(sequence<0>{}));
}
else
{
return make_naive_tensor_view<address_space_enum::global>(
lse_acc_ptr,
make_tuple(kargs.seqlen_q),
make_tuple(1),
number<1>{},
number<1>{});
}
}();
return pad_tensor_view(
lse_acc_dram_naive, lse_acc_dram_window_lengths, sequence<kPadSeqLenQ>{});
}();
@@ -953,13 +1013,37 @@ struct FmhaFwdSplitKVKernel
}();
// Oacc DRAM and Oacc DRAM window
auto o_acc_dram = [&] {
const auto o_acc_dram_naive = [&] {
if constexpr(kMergeNumHeadGroupsSeqLenQ)
{
// reshape: (nhead_ratio_qk, seqlen_q, hdim_v) -> (nhead_ratio_qk * seqlen_q,
// hdim_v)
const auto view = make_naive_tensor_view<address_space_enum::global>(
o_acc_ptr,
make_tuple(kargs.nhead_ratio_qk, kargs.seqlen_q, kargs.hdim_v),
make_tuple(kargs.nhead_stride_o_acc, kargs.stride_o_acc, 1),
number<FmhaPipeline::kAlignmentOacc>{},
number<1>{});
return transform_tensor_view(
view,
make_tuple(
make_merge_transform(make_tuple(kargs.nhead_ratio_qk, kargs.seqlen_q)),
make_pass_through_transform(kargs.hdim_v)),
make_tuple(sequence<0, 1>{}, sequence<2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
}
else
{
return make_naive_tensor_view<address_space_enum::global>(
o_acc_ptr,
make_tuple(kargs.seqlen_q, kargs.hdim_v),
make_tuple(kargs.stride_o_acc, 1),
number<FmhaPipeline::kAlignmentOacc>{},
number<1>{});
}
}();
return pad_tensor_view(
o_acc_dram_naive,
...
@@ -94,16 +94,17 @@ struct BlockFmhaFwdSplitKVPipelineProblem
static constexpr bool kIsGroupMode = kIsGroupMode_;
// attributes from traits
static constexpr bool kPadSeqLenQ = Traits::kPadSeqLenQ;
static constexpr bool kPadSeqLenK = Traits::kPadSeqLenK;
static constexpr bool kPadHeadDimQ = Traits::kPadHeadDimQ;
static constexpr bool kPadHeadDimV = Traits::kPadHeadDimV;
static constexpr auto BiasEnum = Traits::BiasEnum;
static constexpr bool kStoreLSE = Traits::kStoreLSE;
static constexpr bool kDoFp8StaticQuant = Traits::kDoFp8StaticQuant;
static constexpr bool kIsPagedKV = Traits::kIsPagedKV;
static constexpr bool kHasUnevenSplits = kIsGroupMode || Traits::kHasUnevenSplits;
static constexpr bool kMergeNumHeadGroupsSeqLenQ = Traits::kMergeNumHeadGroupsSeqLenQ;
static constexpr index_t kBlockPerCu = Traits::kBlockPerCu;
};
// extract tile size attributes to remove dependency on traits
...
@@ -43,7 +43,8 @@ template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
bool kDoFp8StaticQuant_,
bool kIsPagedKV_,
bool kHasUnevenSplits_,
bool kMergeNumHeadGroupsSeqLenQ_ = false,
index_t kBlockPerCu_ = -1 /* overwrite occupancy if not -1 */>
struct TileFmhaFwdSplitKVTraits
{
static constexpr bool kPadSeqLenQ = kPadSeqLenQ_;
@@ -56,8 +57,9 @@ struct TileFmhaFwdSplitKVTraits
static constexpr bool kDoFp8StaticQuant = kDoFp8StaticQuant_;
static constexpr bool kIsPagedKV = kIsPagedKV_;
// determine if some split (length) is not divisible by tile size
static constexpr bool kHasUnevenSplits = kHasUnevenSplits_;
static constexpr bool kMergeNumHeadGroupsSeqLenQ = kMergeNumHeadGroupsSeqLenQ_;
static constexpr index_t kBlockPerCu = kBlockPerCu_;
};
template <bool kPadSeqLenQ_ /* padding for seqlen_q */,
...
@@ -15,6 +15,7 @@ struct Layernorm2dFwdHostArgs
const void* p_x; // [m ,n], input, fp16/bf16
const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used
const void* p_x_scale; // [1 ,n], smooth scale input, fp32, nullptr if not used
const void* p_x_bias; // [1, n], bias, prec same as input
const void* p_gamma; // [1, n], gamma, prec same as input
const void* p_beta; // [1, n], beta, prec same as input
@@ -43,6 +44,7 @@ struct Layernorm2dFwd
using Problem = typename Pipeline::Problem;
using XDataType = remove_cvref_t<typename Problem::XDataType>;
using XBiasDataType = remove_cvref_t<typename Problem::XBiasDataType>;
using GammaDataType = remove_cvref_t<typename Problem::GammaDataType>;
using BetaDataType = remove_cvref_t<typename Problem::BetaDataType>;
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
@@ -67,6 +69,7 @@ struct Layernorm2dFwd
static constexpr bool kPadM = false; // always no need to pad along M
static constexpr bool kPadN = Problem::Traits::kPadN;
static constexpr bool kTwoPass = Problem::Traits::kTwoPass;
static constexpr auto kXbias = Problem::Traits::kXbias;
static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd;
static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant;
@@ -82,6 +85,7 @@ struct Layernorm2dFwd
const void* p_x; // [m ,n], input, fp16/bf16
const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used
const void* p_x_scale; // [1 ,n], smooth scale input, fp32, nullptr if not used
const void* p_x_bias; // [1, n], bias, prec same as input
const void* p_gamma; // [1, n], gamma, prec same as input
const void* p_beta; // [1, n], beta, prec same as input
@@ -108,6 +112,7 @@ struct Layernorm2dFwd
return Kargs{hargs.p_x,
hargs.p_x_residual,
hargs.p_x_scale,
hargs.p_x_bias,
hargs.p_gamma,
hargs.p_beta,
hargs.p_y,
@@ -152,6 +157,7 @@ struct Layernorm2dFwd
using S_ = typename Problem::BlockShape;
auto surfix = [&] () {
std::string n;
if (kXbias != Layernorm2dXBiasEnum::NO_BIAS) n += _SS_("_") + Layernorm2dXBiasEnumName<kXbias>::name;
if (kFusedAdd != Layernorm2dFusedAddEnum::NO_ADD) n += _SS_("_") + Layernorm2dFusedAddEnumName<kFusedAdd>::name;
if (kFusedQuant != Layernorm2dFusedQuantEnum::NO_SWEEP) n += _SS_("_") + Layernorm2dFusedQuantEnumName<kFusedQuant>::name;
if (kPadN) n += "_pn";
@@ -228,6 +234,27 @@ struct Layernorm2dFwd
}
}();
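// The bias tile window is only materialized when ADD_BIAS is enabled; otherwise a null
// tile window is passed so the pipeline call signature stays the same.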
const auto x_bias_window = [&]() {
if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
{
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const XBiasDataType*>(kargs.p_x_bias),
make_tuple(kargs.n),
make_tuple(1),
number<Vector_N>{},
number<1>{});
const auto tmp2_ =
pad_tensor_view(tmp_, make_tuple(number<Block_N>{}), sequence<false>{});
return make_tile_window(tmp2_, make_tuple(number<Block_N>{}), {0});
}
else
{
return make_null_tile_window(make_tuple(number<Block_N>{}));
}
}();
const auto gamma_window = [&]() {
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
static_cast<const GammaDataType*>(kargs.p_gamma),
@@ -371,6 +398,7 @@ struct Layernorm2dFwd
Pipeline{}(x_window,
x_residual_window,
x_bias_window,
gamma_window,
beta_window,
y_window,
...
@@ -8,6 +8,7 @@
namespace ck_tile {
template <typename XDataType_,
typename XBiasDataType_,
typename GammaDataType_,
typename BetaDataType_,
typename ComputeDataType_,
@@ -21,6 +22,7 @@ template <typename XDataType_,
struct Layernorm2dFwdPipelineProblem
{
using XDataType = remove_cvref_t<XDataType_>;
using XBiasDataType = remove_cvref_t<XBiasDataType_>;
using GammaDataType = remove_cvref_t<GammaDataType_>;
using BetaDataType = remove_cvref_t<BetaDataType_>;
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
...