CK Tile FA Training kernels (#1286)

* FA fwd dropout * FA bwd * epilogue reuse * CMakeLists update * [CK_TILE] support alibi (#1269) * add alibi support * fix code * update code based on comment * Support more hdim * fix fp8 bias * support seqlen_k=0 case * remove unused printf * fix format --------- Co-authored-by: rocking <ChunYu.Lai@amd.com> * now fwd/bwd can build * bwd alibi * add bwd validation stream_config * update generated filenames * update bwd kernel launch * CK_TILE_HOST_DEVICE in philox * Transpose -> transpose * format * format * format * Generate the instance for FA required * format * fix error in WarpGemm --------- Co-authored-by: danyao12 <danyao12> Co-authored-by: carlushuang <carlus.huang@amd.com> Co-authored-by: rocking <ChunYu.Lai@amd.com> Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com> Co-authored-by: Jing Zhang <jizhan@amd.com>

CK Tile FA Training kernels (#1286)
* FA fwd dropout * FA bwd * epilogue reuse * CMakeLists update * [CK_TILE] support alibi (#1269) * add alibi support * fix code * update code based on comment * Support more hdim * fix fp8 bias * support seqlen_k=0 case * remove unused printf * fix format --------- Co-authored-by: rocking <ChunYu.Lai@amd.com> * now fwd/bwd can build * bwd alibi * add bwd validation stream_config * update generated filenames * update bwd kernel launch * CK_TILE_HOST_DEVICE in philox * Transpose -> transpose * format * format * format * Generate the instance for FA required * format * fix error in WarpGemm --------- Co-authored-by: danyao12 <danyao12> Co-authored-by: carlushuang <carlus.huang@amd.com> Co-authored-by: rocking <ChunYu.Lai@amd.com> Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com> Co-authored-by: Jing Zhang <jizhan@amd.com>
2cab8d39 · Dan Yao · GitHub · 76827d82 · 2cab8d39 · 2cab8d39
Unverified Commit 2cab8d39 authored Jun 05, 2024 by Dan Yao Committed by GitHub Jun 04, 2024
10 changed files
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp"
+namespace ck_tile {
+// A is block window on shared memory
+// B is block distributed tensor
+// C is block distributed tensor
+template <typename Problem_, typename Policy_ = BlockGemmASmemBRegCRegV1DefaultPolicy>
+struct BlockGemmASmemBRegCRegV1
+{
+    using Problem        = remove_cvref_t<Problem_>;
+    using Policy         = remove_cvref_t<Policy_>;
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+    // C += A * B
+    template <typename CBlockTensor, typename ABlockWindowTmp, typename BBlockTensorTmp>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   const ABlockWindowTmp& a_block_window_tmp,
+                                   const BBlockTensorTmp& b_block_tensor_tmp) const
+    {
+        static_assert(
+            std::is_same_v<ADataType, remove_cv_t<typename ABlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cv_t<typename BBlockTensorTmp::DataType>> &&
+                std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
+            "wrong!");
+        // constexpr index_t MPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<0>{}];
+        // constexpr index_t NPerBlock = BBlockTensorTmp{}.get_lengths()[number<0>{}];
+        // constexpr index_t KPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<1>{}];
+        constexpr index_t MPerBlock = BlockGemmShape::kM;
+        constexpr index_t NPerBlock = BlockGemmShape::kN;
+        constexpr index_t KPerBlock = BlockGemmShape::kK;
+        // static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN &&
+        //                   KPerBlock == BlockGemmShape::kK,
+        //               "wrong!");
+        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+        constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp;
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp;
+        const index_t iMWarp = get_warp_id() / NWarp;
+        constexpr auto b_block_outer_dstr_encoding =
+            tile_distribution_encoding<sequence<MWarp>,
+                                       tuple<sequence<NIterPerWarp, NWarp>, sequence<KIterPerWarp>>,
+                                       tuple<sequence<0, 1>>,
+                                       tuple<sequence<0, 1>>,
+                                       sequence<1, 2>,
+                                       sequence<0, 0>>{};
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+        constexpr auto b_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            b_block_outer_dstr_encoding, typename WG::BWarpDstrEncoding{});
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+        constexpr auto b_block_dstr = make_static_tile_distribution(b_block_dstr_encode);
+        // constrcut from B-block-tensor from B-Block-tensor-tmp
+        // FIXME: need method to check b_block_tensor and b_block_tensor_tmp have equivalent
+        // distribution
+        auto b_block_tensor =
+            make_static_distributed_tensor<typename BBlockTensorTmp::DataType>(b_block_dstr);
+        b_block_tensor.get_thread_buffer() = b_block_tensor_tmp.get_thread_buffer();
+        // construct A-warp-window
+        auto a_warp_window_tmp = make_tile_window(
+            a_block_window_tmp.get_bottom_tensor_view(),
+            make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+            a_block_window_tmp.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
+            make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+#if 0 // FIXME: using array will cause register spill
+        array<array<decltype(b_warp_window_tmp), KIterPerWarp>, NIterPerWarp> b_warp_windows{
+            {b_warp_window_tmp}};
+        for(index_t nIter = 0; nIter < NIterPerWarp; nIter++)
+        {
+            for(index_t kIter = 0; kIter < KIterPerWarp; kIter++)
+            {
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            }
+        }
+#else
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows;
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+        // check C-block-distribution
+        static_assert(
+            std::is_same_v<remove_cvref_t<decltype(c_block_dstr_encode)>,
+                           remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
+                                                       .get_static_tile_distribution_encoding())>>,
+            "wrong!");
+        using BWarpDstr = typename WG::BWarpDstr;
+        using CWarpDstr = typename WG::CWarpDstr;
+        using BWarpTensor = typename WG::BWarpTensor;
+        using CWarpTensor = typename WG::CWarpTensor;
+        constexpr auto b_warp_y_lengths =
+            to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+        // hot loop:
+        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                // read A warp tensor from A Block window
+                const auto a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter));
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B block tensor
+                    BWarpTensor b_warp_tensor;
+                    b_warp_tensor.get_thread_buffer() = b_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
+                    // read C warp tensor from C block tensor
+                    CWarpTensor c_warp_tensor;
+                    c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+                    // warp GEMM
+                    WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+                    // write C warp tensor into C block tensor
+                    c_block_tensor.set_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        c_warp_tensor.get_thread_buffer());
+                });
+            });
+        });
+    }
+    CK_TILE_DEVICE constexpr auto MakeCBlockTile() const
+    {
+        constexpr index_t MPerBlock = BlockGemmShape::kM;
+        constexpr index_t NPerBlock = BlockGemmShape::kN;
+        constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        // constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+    // C = A * B
+    template <typename ABlockWindowTmp, typename BBlockTensorTmp>
+    CK_TILE_DEVICE auto operator()(const ABlockWindowTmp& a_block_window_tmp,
+                                   const BBlockTensorTmp& b_block_tensor_tmp) const
+    {
+        auto c_block_tensor = MakeCBlockTile();
+        operator()(c_block_tensor, a_block_window_tmp, b_block_tensor_tmp);
+        return c_block_tensor;
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+namespace ck_tile {
+template <typename AType_,
+          typename BType_,
+          typename CType_,
+          typename BlockWarps_,
+          typename WarpGemm_>
+struct BlockGemmASmemBRegCRegV1CustomPolicy
+{
+    using AType = remove_cvref_t<AType_>;
+    using BType = remove_cvref_t<BType_>;
+    using CType = remove_cvref_t<CType_>;
+    using BlockWarps = remove_cvref_t<BlockWarps_>;
+    static constexpr index_t kMWarps = BlockWarps::at(number<0>{});
+    static constexpr index_t kNWarps = BlockWarps::at(number<1>{});
+    static constexpr index_t kKWarps = BlockWarps::at(number<2>{});
+    using WarpGemm = remove_cvref_t<WarpGemm_>;
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
+    {
+        return make_tuple(WarpGemm{}, kMWarps, kNWarps);
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+namespace ck_tile {
+// Default policy for BlockGemmASmemBRegCRegV1
+// Default policy class should not be templated, put template on member functions instead
+struct BlockGemmASmemBRegCRegV1DefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
+    {
+        if constexpr(std::is_same_v<typename Problem::ADataType, half_t> &&
+                     std::is_same_v<typename Problem::BDataType, half_t> &&
+                     std::is_same_v<typename Problem::CDataType, float>)
+        {
+#if 0
+            constexpr index_t kBlockSize = Problem::kBlockSize;
+            constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+            constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+            constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+            static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
+            constexpr index_t NumWarp = kBlockSize / get_warp_size();
+            // FIXME
+            if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
+                         kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
+            {
+                return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
+            }
+            else
+            {
+                return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
+            }
+#else
+            return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
+#endif
+        }
+        else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
+                          std::is_same_v<typename Problem::BDataType, bf16_t> &&
+                          std::is_same_v<typename Problem::CDataType, float>)
+        {
+            return make_tuple(WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution{}, 4, 1);
+        }
+    }
+};
+} // namespace ck_tile
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_problem.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_problem.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
-#pragma once
-#include "ck_tile/core.hpp"
-namespace ck_tile {
-// Problem Description for BlockGemmASmemBSmemCRegV1
-template <typename ADataType_,
-          typename BDataType_,
-          typename CDataType_,
-          index_t kBlockSize_,
-          typename BlockGemmShape_>
-struct BlockGemmASmemBSmemCRegProblem
-{
-    using ADataType      = remove_cvref_t<ADataType_>;
-    using BDataType      = remove_cvref_t<BDataType_>;
-    using CDataType      = remove_cvref_t<CDataType_>;
-    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
-    static constexpr index_t kBlockSize = kBlockSize_;
-};
-} // namespace ck_tile
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once

--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_problem.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_problem.hpp
@@ -7,13 +7,13 @@
 namespace ck_tile {
-// Problem Description for BlockGemmARegBSmemCReg
+// Problem Description for BlockGemm
 template <typename ADataType_,
          typename BDataType_,
          typename CDataType_,
          index_t kBlockSize_,
          typename BlockGemmShape_>
-struct BlockGemmARegBSmemCRegProblem
+struct BlockGemmProblem
 {
    using ADataType      = remove_cvref_t<ADataType_>;
    using BDataType      = remove_cvref_t<BDataType_>;

--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -22,6 +22,9 @@ using WarpGemmMfmaF16F16F32M32N32K16 =
 using WarpGemmMfmaF16F16F32M16N16K32 =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplF16F16F32M16N16K16, 2>>;
+using WarpGemmMfmaF16F16F32M32N32K16SwizzleA = WarpGemmImpl<
+    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplF16F16F32M32N32K8, 2>>;
 using WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplF16F16F32M32N32K8>>;
@@ -38,7 +41,7 @@ using WarpGemmMfmaF16F16F32M16N16K32TransposedCDistribution =
        WarpGemmAttributeMfmaImplF16F16F32M16N16K16,
        2>>;
-using WarpGemmMfmaF16F16F32M16N16K32SwizzleBTransposedCDistribution =
+using WarpGemmMfmaF16F16F32M32N32K16SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
        WarpGemmAttributeMfmaImplF16F16F32M32N32K8,
        2>>;
@@ -56,6 +59,9 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16 =
 using WarpGemmMfmaBf16Bf16F32M16N16K32 =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateK<WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16, 2>>;
+using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA = WarpGemmImpl<
+    WarpGemmAtrributeMfmaIterateK_SwizzleA<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8, 2>>;
 using WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution = WarpGemmImpl<
    WarpGemmAtrributeMfmaTransposedCDistribution<WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8>>;
@@ -72,7 +78,7 @@ using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16,
        2>>;
-using WarpGemmMfmaBf16Bf16F32M16N16K32SwizzleBTransposedCDistribution =
+using WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleBTransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB<
        WarpGemmAttributeMfmaImplBf16Bf16F32M32N32K8,
        2>>;

--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp
@@ -468,4 +468,92 @@ struct WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution_SwizzleB
    }
 };
+template <typename WarpGemmAttributeMfmaImpl_, index_t kKIter, index_t SFactor_ = 2>
+struct WarpGemmAtrributeMfmaIterateK_SwizzleA
+{
+    using Impl = remove_cvref_t<WarpGemmAttributeMfmaImpl_>;
+    using ADataType = typename Impl::ADataType;
+    using BDataType = typename Impl::BDataType;
+    using CDataType = typename Impl::CDataType;
+    using AVecType =
+        ext_vector_t<ADataType, vector_traits<typename Impl::AVecType>::vector_size * kKIter>;
+    using BVecType =
+        ext_vector_t<BDataType, vector_traits<typename Impl::BVecType>::vector_size * kKIter>;
+    using CVecType = typename Impl::CVecType;
+    static constexpr index_t kM      = Impl::kM;
+    static constexpr index_t kN      = Impl::kN;
+    static constexpr index_t kK      = Impl::kK * kKIter;
+    static constexpr index_t SFactor = SFactor_; // group how many CM1 together
+    using AWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kAMLane / (Impl::kCMLane * SFactor * Impl::kCM1PerLane),
+                       Impl::kCMLane,
+                       SFactor,
+                       Impl::kCM1PerLane>,
+              sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+        tuple<sequence<2, 1, 1, 1, 1>>,
+        tuple<sequence<0, 0, 2, 1, 3>>,
+        sequence<2>,
+        sequence<1>>;
+    using BWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kBNLane>, sequence<Impl::kABKLane, Impl::kABKPerLane * kKIter>>,
+        tuple<sequence<2, 1>>,
+        tuple<sequence<0, 0>>,
+        sequence<2>,
+        sequence<1>>;
+    using CWarpDstrEncoding = tile_distribution_encoding<
+        sequence<>,
+        tuple<sequence<Impl::kCM0PerLane / SFactor, Impl::kCMLane, Impl::kCM1PerLane * SFactor>,
+              sequence<Impl::kCNLane>>,
+        tuple<sequence<1, 2>>,
+        tuple<sequence<1, 0>>,
+        sequence<1, 1>,
+        sequence<0, 2>>;
+    // c_vec += a_vec * b_vec
+    CK_TILE_DEVICE void
+    operator()(CVecType& c_vec, const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        using buf_a = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b = thread_buffer<typename Impl::BVecType, kKIter>;
+        static_for<0, kKIter, 1>{}([&](auto iKIter) {
+            Impl{}(c_vec,
+                   reinterpret_cast<const buf_a&>(a_vec)
+                       .template get_as<typename Impl::AVecType>()[iKIter],
+                   reinterpret_cast<const buf_b&>(b_vec)
+                       .template get_as<typename Impl::BVecType>()[iKIter]);
+        });
+    }
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        constexpr auto I0 = number<0>{};
+        using buf_a       = thread_buffer<typename Impl::AVecType, kKIter>;
+        using buf_b       = thread_buffer<typename Impl::BVecType, kKIter>;
+        auto c_vec = Impl{}(
+            reinterpret_cast<const buf_a&>(a_vec).template get_as<typename Impl::AVecType>()[I0],
+            reinterpret_cast<const buf_b&>(b_vec).template get_as<typename Impl::BVecType>()[I0]);
+        static_for<1, kKIter, 1>{}([&](auto iKIter) {
+            Impl{}(c_vec,
+                   reinterpret_cast<const buf_a&>(a_vec)
+                       .template get_as<typename Impl::AVecType>()[iKIter],
+                   reinterpret_cast<const buf_b&>(b_vec)
+                       .template get_as<typename Impl::BVecType>()[iKIter]);
+        });
+        return c_vec;
+    }
+};
 } // namespace ck_tile