debug a

e076a320 · feifei14119 · 839cf897 · e076a320 · e076a320 · e076a320
Commit e076a320 authored Feb 10, 2025 by feifei14119
10 changed files
--- a/example/ck_tile/18_flatmm/CMakeLists.txt
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
@@ -5,5 +5,5 @@ list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-flo
 list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter)
 list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-local-typedef)
 #list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -ggdb -g -O0 -v -save-temps)
-list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DFEIFEI_DEBUG=1)
+list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DFEIFEI_DEBUG=1 -DDEBUG_CNT=64)
 target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -183,9 +183,9 @@ int run_flatmm_example_with_layouts(int argc,
    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());

 #if FEIFEI_DEBUG
-    ck_tile::HostTensor<int> dbg_int({M * N * 64});
-    ck_tile::HostTensor<float> dbg_fp32({M * N * 64});
-    ck_tile::HostTensor<ADataType> dbg_f168({M * N * 64});
+    ck_tile::HostTensor<int> dbg_int({M * N * DEBUG_CNT});
+    ck_tile::HostTensor<float> dbg_fp32({M * N * DEBUG_CNT});
+    ck_tile::HostTensor<ADataType> dbg_f168({M * N * DEBUG_CNT});

    ck_tile::DeviceMem dbg_int_buf(dbg_int.get_element_space_size_in_bytes());
    ck_tile::DeviceMem dbg_fp32_buf(dbg_fp32.get_element_space_size_in_bytes());
@@ -362,7 +362,7 @@ int run_flatmm_example_with_layouts(int argc,
    int GridDimY  = 1;
    int BlockDimX = 64;
    int BlockDimY = 4;
-    int DbgCnt    = 64;
+    int DbgCnt    = DEBUG_CNT;
    int BlockSize = BlockDimX * BlockDimY;
    // a_host
    {

--- a/include/ck_tile/ops/flatmm.hpp
+++ b/include/ck_tile/ops/flatmm.hpp
@@ -3,40 +3,9 @@

 #pragma once

-#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1_default_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_custom_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
-#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
-#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
-#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
-// #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
 #include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
-#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
-// #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
-// #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp"
-#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp"
-// #include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
 #include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
@@ -45,10 +14,15 @@
 #include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
 #include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"

-#include "ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp"
+// block
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp"
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp"
+// pipeline
 #include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp"
-#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp"
+// kernel
+#include "ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp"

 #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"

--- a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_default_policy.hpp"
+
+namespace ck_tile {
+
+// A is block window on shared memory
+// B is block window on shared memory
+// C is block distributed tensor
+template <typename Problem_, typename BlockPolicy_ = BlockFlatmmASmemBSmemCRegV1DefaultPolicy>
+struct BlockFlatmmASmemBSmemCRegV1
+{
+    using Problem        = remove_cvref_t<Problem_>;
+    using BlockPolicy    = remove_cvref_t<BlockPolicy_>;
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    static constexpr index_t kBlockSize = Problem::kBlockSize;
+
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        constexpr index_t MPerBlock = BlockGemmShape::kM;
+        constexpr index_t NPerBlock = BlockGemmShape::kN;
+
+        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
+
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+
+        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
+        return c_block_tensor;
+    }
+
+#if 1
+    // C += A * B
+    // template <typename CBlockTensor, typename ABlockWindow, typename BBlockWindow>
+    template <typename ABlockWindow>
+    CK_TILE_DEVICE void operator()(const ABlockWindow& a_block_window
+#if FEIFEI_DEBUG
+                                   ,
+                                   const BDataType* b_ptr,
+                                   int* dbg_int,
+                                   float* dbg_fp32,
+                                   void* dbg_f168
+#endif
+    ) const
+    {
+#if FEIFEI_DEBUG
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[BLOCK ] BlockFlatmmASmemBSmemCRegV1():\n");
+        }
+
+        uint32_t tidx = threadIdx.x;
+        uint32_t tidy = threadIdx.y;
+        uint32_t bidx = blockIdx.x;
+        uint32_t bidy = blockIdx.y;
+        uint32_t bdmx = blockDim.x;
+        uint32_t bdmy = blockDim.y;
+        uint32_t gdmx = gridDim.x;
+        uint32_t gdmy = gridDim.y;
+        uint32_t gid  = ((bdmx * bdmy) * gdmx) * bidy + (bdmx * bdmy) * bidx + bdmx * tidy + tidx;
+
+        half_t* dbg_f16 = static_cast<half_t*>(dbg_f168);
+        for(int i = 0; i < DEBUG_CNT; i++)
+        {
+            dbg_int[gid * DEBUG_CNT + i]  = -1;
+            dbg_fp32[gid * DEBUG_CNT + i] = -1.0f;
+            dbg_f16[gid * DEBUG_CNT + i] = ck_tile::type_convert<ck_tile::half_t>(-1.0f);
+        }
+#endif
+        /*
+        static_assert(std::is_same_v<ADataType, typename ABlockWindow::DataType> &&
+                          std::is_same_v<BDataType, typename BBlockWindow::DataType> &&
+                          std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                      "wrong!");
+        */
+
+        constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}];
+        // constexpr index_t NPerBlock = BBlockWindow{}.get_window_lengths()[number<0>{}];
+        constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}];
+
+        /*
+        static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN &&
+                          KPerBlock == BlockGemmShape::kK,
+                      "wrong!");
+        */
+
+        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+        using WG              = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        // constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp;
+        // constexpr index_t NPerBlockPerIter = NPerBlock / NIterPerWarp;
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp;
+
+        const index_t iMWarp = get_warp_id() / NWarp;
+        const index_t iNWarp = get_warp_id() % NWarp;
+
+        // construct A-warp-window
+        auto a_warp_window_tmp = make_tile_window(
+            a_block_window.get_bottom_tensor_view(),
+            make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+            a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
+            make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows;
+
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+
+        // Warp loop in block:
+        constexpr index_t kIter  = 0;
+        constexpr index_t mIter  = 0;
+        const auto a_warp_tensor = load_tile(a_warp_windows(number<mIter>{})(number<kIter>{}));
+
+#if 1
+        // feifei TODO: Implement gemm here
+#else
+        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp;
+        constexpr index_t NPerBlockPerIter = NPerBlock / NIterPerWarp;
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp;
+
+        const index_t iMWarp = get_warp_id() / NWarp;
+        const index_t iNWarp = get_warp_id() % NWarp;
+
+        // construct A-warp-window
+        auto a_warp_window_tmp = make_tile_window(
+            a_block_window.get_bottom_tensor_view(),
+            make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+            a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
+            make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+
+#if 0 // FIXME: using array will cause register spill
+        array<array<decltype(a_warp_window_tmp), KIterPerWarp>, MIterPerWarp> a_warp_windows{
+            {a_warp_window_tmp}};
+
+        for(index_t mIter = 0; mIter < MIterPerWarp; mIter++)
+        {
+            for(index_t kIter = 0; kIter < KIterPerWarp; kIter++)
+            {
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            }
+        }
+#else
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows;
+
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+
+        // construct B-warp-window
+        auto b_warp_window_tmp = make_tile_window(
+            b_block_window.get_bottom_tensor_view(),
+            make_tuple(number<WG::kN>{}, number<WG::kK>{}),
+            b_block_window.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0},
+            make_static_tile_distribution(typename WG::BWarpDstrEncoding{}));
+
+#if 0 // FIXME: using array will cause register spill
+        array<array<decltype(b_warp_window_tmp), KIterPerWarp>, NIterPerWarp> b_warp_windows{
+            {b_warp_window_tmp}};
+
+        for(index_t nIter = 0; nIter < NIterPerWarp; nIter++)
+        {
+            for(index_t kIter = 0; kIter < KIterPerWarp; kIter++)
+            {
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            }
+        }
+#else
+        statically_indexed_array<
+            statically_indexed_array<decltype(b_warp_window_tmp), KIterPerWarp>,
+            NIterPerWarp>
+            b_warp_windows;
+
+        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
+
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+
+        using CWarpDstr   = typename WG::CWarpDstr;
+        using CWarpTensor = typename WG::CWarpTensor;
+
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        // hot loop:
+        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                // read A warp tensor from A block window
+                const auto a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter));
+
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B Block window
+                    const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter));
+
+                    // read C warp tensor from C block tensor
+                    CWarpTensor c_warp_tensor;
+
+                    c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                    // warp GEMM
+                    WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                    // write C warp tensor into C block tensor
+                    c_block_tensor.set_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        c_warp_tensor.get_thread_buffer());
+                });
+            });
+        });
+#endif
+    }
+
+#else
+    // C += A * B
+    template <typename CBlockTensor, typename ABlockWindow, typename BBlockWindow>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   const ABlockWindow& a_block_window,
+                                   const BBlockWindow& b_block_window) const
+    {
+        static_assert(std::is_same_v<ADataType, typename ABlockWindow::DataType> &&
+                          std::is_same_v<BDataType, typename BBlockWindow::DataType> &&
+                          std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                      "wrong!");
+
+        constexpr index_t MPerBlock = ABlockWindow{}.get_window_lengths()[number<0>{}];
+        constexpr index_t NPerBlock = BBlockWindow{}.get_window_lengths()[number<0>{}];
+        constexpr index_t KPerBlock = ABlockWindow{}.get_window_lengths()[number<1>{}];
+
+        static_assert(MPerBlock == BlockGemmShape::kM && NPerBlock == BlockGemmShape::kN &&
+                          KPerBlock == BlockGemmShape::kK,
+                      "wrong!");
+
+        constexpr auto config = BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();
+
+        using WG = remove_cvref_t<decltype(config.template at<0>())>;
+
+        constexpr index_t MWarp = config.template at<1>();
+        constexpr index_t NWarp = config.template at<2>();
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WG::kN);
+        constexpr index_t KIterPerWarp = KPerBlock / WG::kK;
+
+        constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp;
+        constexpr index_t NPerBlockPerIter = NPerBlock / NIterPerWarp;
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp;
+
+        const index_t iMWarp = get_warp_id() / NWarp;
+        const index_t iNWarp = get_warp_id() % NWarp;
+
+        // construct A-warp-window
+        auto a_warp_window_tmp = make_tile_window(
+            a_block_window.get_bottom_tensor_view(),
+            make_tuple(number<WG::kM>{}, number<WG::kK>{}),
+            a_block_window.get_window_origin() + multi_index<2>{iMWarp * WG::kM, 0},
+            make_static_tile_distribution(typename WG::AWarpDstrEncoding{}));
+
+#if 0 // FIXME: using array will cause register spill
+        array<array<decltype(a_warp_window_tmp), KIterPerWarp>, MIterPerWarp> a_warp_windows{
+            {a_warp_window_tmp}};
+
+        for(index_t mIter = 0; mIter < MIterPerWarp; mIter++)
+        {
+            for(index_t kIter = 0; kIter < KIterPerWarp; kIter++)
+            {
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            }
+        }
+#else
+        statically_indexed_array<
+            statically_indexed_array<decltype(a_warp_window_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows;
+
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows(mIter)(kIter) = a_warp_window_tmp;
+
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+
+        // construct B-warp-window
+        auto b_warp_window_tmp = make_tile_window(
+            b_block_window.get_bottom_tensor_view(),
+            make_tuple(number<WG::kN>{}, number<WG::kK>{}),
+            b_block_window.get_window_origin() + multi_index<2>{iNWarp * WG::kN, 0},
+            make_static_tile_distribution(typename WG::BWarpDstrEncoding{}));
+
+#if 0 // FIXME: using array will cause register spill
+        array<array<decltype(b_warp_window_tmp), KIterPerWarp>, NIterPerWarp> b_warp_windows{
+            {b_warp_window_tmp}};
+
+        for(index_t nIter = 0; nIter < NIterPerWarp; nIter++)
+        {
+            for(index_t kIter = 0; kIter < KIterPerWarp; kIter++)
+            {
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            }
+        }
+#else
+        statically_indexed_array<
+            statically_indexed_array<decltype(b_warp_window_tmp), KIterPerWarp>,
+            NIterPerWarp>
+            b_warp_windows;
+
+        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                b_warp_windows(nIter)(kIter) = b_warp_window_tmp;
+
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+
+        using CWarpDstr   = typename WG::CWarpDstr;
+        using CWarpTensor = typename WG::CWarpTensor;
+
+        constexpr auto c_warp_y_lengths =
+            to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+        constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+        // hot loop:
+        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                // read A warp tensor from A block window
+                const auto a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter));
+
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    // read B warp tensor from B Block window
+                    const auto b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter));
+
+                    // read C warp tensor from C block tensor
+                    CWarpTensor c_warp_tensor;
+
+                    c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                    // warp GEMM
+                    WG{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                    // write C warp tensor into C block tensor
+                    c_block_tensor.set_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        c_warp_tensor.get_thread_buffer());
+                });
+            });
+        });
+    }
+
+    // C = A * B
+    template <typename ABlockTensorTmp, typename BBlockWindow>
+    CK_TILE_DEVICE auto operator()(const ABlockTensorTmp& a_block_tensor_tmp,
+                                   const BBlockWindow& b_block_window) const
+    {
+        auto c_block_tensor = MakeCBlockTile();
+        operator()(c_block_tensor, a_block_tensor_tmp, b_block_window);
+        return c_block_tensor;
+    }
+#endif
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp
+++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_custom_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+// Default policy for BlockGemmASmemBSmemCRegV1
+// Default policy class should not be templated, put template on member functions instead
+template <typename AType_,
+          typename BType_,
+          typename CType_,
+          typename BlockWarps_,
+          typename WarpGemm_>
+struct BlockFlatmmASmemBSmemCRegV1CustomPolicy
+{
+    using AType = remove_cvref_t<AType_>;
+    using BType = remove_cvref_t<BType_>;
+    using CType = remove_cvref_t<CType_>;
+
+    using BlockWarps = remove_cvref_t<BlockWarps_>;
+
+    static constexpr index_t kMWarps = BlockWarps::at(number<0>{});
+    static constexpr index_t kNWarps = BlockWarps::at(number<1>{});
+    static constexpr index_t kKWarps = BlockWarps::at(number<2>{});
+
+    using WarpGemm = remove_cvref_t<WarpGemm_>;
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
+    {
+        return make_tuple(WarpGemm{}, kMWarps, kNWarps);
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/flatmm/block/block_flatmm_asmem_bsmem_creg_v1_default_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+
+namespace ck_tile {
+
+// Default policy for BlockGemmASmemBSmemCRegV1
+// Default policy class should not be templated, put template on member functions instead
+struct BlockFlatmmASmemBSmemCRegV1DefaultPolicy
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
+    {
+        if constexpr(std::is_same_v<typename Problem::ADataType, half_t> &&
+                     std::is_same_v<typename Problem::BDataType, half_t> &&
+                     std::is_same_v<typename Problem::CDataType, float>)
+        {
+#if 0
+            constexpr index_t kBlockSize = Problem::kBlockSize;
+
+            constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+            constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+            constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+            static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
+
+            constexpr index_t NumWarp = kBlockSize / get_warp_size();
+
+            if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
+                         kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
+            {
+                return make_tuple(WarpGemmMfmaF16F16F32M32N32K16{}, 2, 2);
+            }
+            else
+            {
+                return make_tuple(WarpGemmMfmaF16F16F32M32N32K16{}, 2, 2);
+            }
+#else
+            return make_tuple(WarpGemmMfmaF16F16F32M32N32K16TransposedCDistribution{}, 4, 1);
+#endif
+        }
+        else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
+                          std::is_same_v<typename Problem::BDataType, bf16_t> &&
+                          std::is_same_v<typename Problem::CDataType, float>)
+        {
+            return make_tuple(WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution{}, 4, 1);
+        }
+        else
+        {
+            static_assert(false, "Unsupported data type configuration for GEMM warp execution.");
+        }
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
@@ -85,7 +85,7 @@ struct FlatmmKernel

    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
    {
-        return TilePartitioner::GridSize(M, N);
+        return TilePartitioner::GridSize(M, N); // feifei TODO: split K here
        // return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
    }

@@ -178,7 +178,7 @@ struct FlatmmKernel

        index_t a_k_split_offset;
        index_t b_k_split_offset;
-        index_t splitted_k;
+        index_t splitted_k; // problem K after splitted
    };

    CK_TILE_HOST static bool IsSupportedArgument(const FlatmmKernelArgs& kargs)
@@ -473,7 +473,7 @@ struct FlatmmKernel
                                         const BDataType* b_ptr,
                                         int* dbg_int,
                                         float* dbg_fp32,
-                                         short* dbg_f168
+                                         void* dbg_f168
 #endif
    )
    {
@@ -481,12 +481,55 @@ struct FlatmmKernel
        // Create Flatmm tensor views, pad views and tile windows
        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
            a_ptr, b_shuffle_ptr, c_ptr, kargs, splitk_batch_offset);
-        // origin layout
-        // const auto& gemm_tensor_views_tuple =
-        //    MakeGemmTensorViews<DstInMemOp>(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset);
+        // Debug origin layout
+        // const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
+        //  a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset);

        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
-        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const auto& gemm_tile_windows =
+            MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+
+#if FEIFEI_DEBUG
+        ////////////////////////////////////////////////////////
+        const auto& a_gemm_tensor_views = gemm_tensor_views_tuple.at(I0);   // tensor_view
+        const auto& a_gemm_tensor_desc = a_gemm_tensor_views.desc_;         // tensor_descriptor
+        const auto& a_gemm_buff_views = a_gemm_tensor_views.buf_;           // buffer_view
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[KERNEL] a_gemm_tensor_view: size = %ld, len = [%d, %d], top = [%d, %d], upper = %d, lower = %d\n",
+                    a_gemm_tensor_desc.get_element_space_size(),
+                    a_gemm_tensor_desc.get_length(I0), a_gemm_tensor_desc.get_length(I1),
+                    a_gemm_tensor_desc.get_top_dimension_hidden_ids()[0], a_gemm_tensor_desc.get_top_dimension_hidden_ids()[1],
+                    a_gemm_tensor_desc.get_upper_dimension_hidden_idss()(I0)[0],
+                    a_gemm_tensor_desc.get_lower_dimension_hidden_idss()(I0)[0]
+                   );
+        }
+
+        const auto& a_pad_tensor_views = gemm_pad_views.at(I0);     // tensor_view
+        const auto& a_pad_tensor_desc = a_pad_tensor_views.desc_;   // tensor_descriptor
+        const auto& a_pad_buff_views = a_pad_tensor_views.buf_;     // buffer_view
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[KERNEL] a_pad_tensor_view:  size = %ld, len = [%d, %d], top = [%d, %d], upper = %d, lower = %d\n",
+                    a_pad_tensor_desc.get_element_space_size(),
+                    a_pad_tensor_desc.get_length(I0), a_pad_tensor_desc.get_length(I1),
+                    a_pad_tensor_desc.get_top_dimension_hidden_ids()[0], a_pad_tensor_desc.get_top_dimension_hidden_ids()[1],
+                    a_pad_tensor_desc.get_upper_dimension_hidden_idss()(I0)[0],
+                    a_pad_tensor_desc.get_lower_dimension_hidden_idss()(I0)[0]
+                   );
+        }
+
+        const auto& a_tile_win = gemm_tile_windows.at(I0);     // tile_window_with_static_lengths
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[KERNEL] a_gemm_tile_window: dim_num = %d\n",
+                    a_tile_win.get_num_of_dimension()
+                   );
+        }
+        ////////////////////////////////////////////////////////
+#endif

        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);

@@ -555,11 +598,14 @@ struct FlatmmKernel

        int* dbg_int    = static_cast<int*>(kargs.dbg_int_ptr);
        float* dbg_fp32 = static_cast<float*>(kargs.dbg_fp32_ptr);
-        short* dbg_f168 = static_cast<short*>(kargs.dbg_f168_ptr);
+        half_t* dbg_f16 = static_cast<half_t*>(kargs.dbg_f168_ptr);

-        dbg_int[gid]  = 1;
-        dbg_fp32[gid] = 1.0f;
-        dbg_f168[gid] = ck_tile::type_convert<ck_tile::half_t>(1.0f);
+        for(int i = 0; i < DEBUG_CNT; i++)
+        {
+            dbg_int[gid * DEBUG_CNT + i]  = 0;
+            dbg_fp32[gid * DEBUG_CNT + i] = .0f;
+            dbg_f16[gid * DEBUG_CNT + i]  = ck_tile::type_convert<ck_tile::half_t>(0.f);
+        }
 #endif

        const auto [iM, iN] = TilePartitioner::GetOutputTileIndex(blockIdx.x, blockIdx.y);
@@ -592,7 +638,7 @@ struct FlatmmKernel
                      b_ptr,
                      dbg_int,
                      dbg_fp32,
-                      dbg_f168
+                      kargs.dbg_f168_ptr
 #endif
            );
        }
@@ -611,7 +657,7 @@ struct FlatmmKernel
                                                         b_ptr,
                                                         dbg_int,
                                                         dbg_fp32,
-                                                         dbg_f168
+                                                         kargs.dbg_f168_ptr
 #endif
            );
        }

--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -4,14 +4,14 @@
 #pragma once

 #include "ck_tile/core.hpp"
-#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp"

 namespace ck_tile {

 //  A Tile Window: global memory
 //  B Tile Window: global memory
 //  C Distributed tensor: register
-template <typename Problem, typename Policy = FlatmmPipelineAGmemBGmemCRegV1DefaultPolicy>
+template <typename Problem, typename PipelinePolicy = UniversalFlatmmPipelineAgBgCrPolicy> // feifei TODO: add default policy
 struct FlatmmPipelineAGmemBGmemCRegV1
 {
    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
@@ -23,7 +23,8 @@ struct FlatmmPipelineAGmemBGmemCRegV1
    using BLayout = remove_cvref_t<typename Problem::BLayout>;
    using CLayout = remove_cvref_t<typename Problem::CLayout>;

-    using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
+    using BlockFlatmm =
+        remove_cvref_t<decltype(PipelinePolicy::template GetBlockFlatmm<Problem>())>;

    static constexpr index_t BlockSize = Problem::kBlockSize;

@@ -41,21 +42,24 @@ struct FlatmmPipelineAGmemBGmemCRegV1

    CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
    {
-        return integer_divide_ceil(
-                   sizeof(ADataType) *
-                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
-                   16) *
+        return integer_divide_ceil(sizeof(ADataType) *
+                                       PipelinePolicy::template MakeALdsBlockDescriptor<Problem>()
+                                           .get_element_space_size(),
+                                   16) *
                   16 +
-               sizeof(BDataType) *
-                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+               sizeof(BDataType) * PipelinePolicy::template MakeBLdsBlockDescriptor<Problem>()
+                                       .get_element_space_size();
    }

    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
    {
-        return Policy::template GetSmemSize<Problem>();
+        return PipelinePolicy::template GetSmemSize<Problem>();
    }

-    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC()
+    {
+        return PipelinePolicy::IsTransposeC();
+    }

    template <typename ADramBlockWindowTmp,
              typename BDramBlockWindowTmp,
@@ -72,7 +76,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
                                        const BDataType* b_ptr,
                                        int* dbg_int,
                                        float* dbg_fp32,
-                                        short* dbg_f168
+                                        void* dbg_f168
 #endif
    ) const
    {
@@ -80,6 +84,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
        {
            printf("[PIPELN] FlatmmPipelinen():\n");
+            printf("[PIPELN] num_loop = %d\n", num_loop);
        }

        uint32_t tidx = threadIdx.x;
@@ -92,9 +97,13 @@ struct FlatmmPipelineAGmemBGmemCRegV1
        uint32_t gdmy = gridDim.y;
        uint32_t gid  = ((bdmx * bdmy) * gdmx) * bidy + (bdmx * bdmy) * bidx + bdmx * tidy + tidx;

-        dbg_int[gid]  = -1;
-        dbg_fp32[gid] = -1.0f;
-        dbg_f168[gid] = ck_tile::type_convert<ck_tile::half_t>(-1.0f);
+        half_t* dbg_f16 = static_cast<half_t*>(dbg_f168);
+        for(int i = 0; i < DEBUG_CNT; i++)
+        {
+            dbg_int[gid * DEBUG_CNT + i]  = 1;
+            dbg_fp32[gid * DEBUG_CNT + i] = 1.0f;
+            dbg_f16[gid * DEBUG_CNT + i] = ck_tile::type_convert<ck_tile::half_t>(1.0f);
+        }
 #endif
        static_assert(
            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
@@ -108,12 +117,93 @@ struct FlatmmPipelineAGmemBGmemCRegV1

 #if 1
        // feifei TODO: Implement gemm here
+        // Get block flatmm
+        auto block_flatmm = BlockFlatmm(); // struct BlockFlatmmASmemBSmemCRegV1
+
+        // A tile in LDS
+        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+
+        constexpr auto a_lds_block_desc =
+            PipelinePolicy::template MakeALdsBlockDescriptor<Problem>();
+
+        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+        constexpr index_t a_lds_block_space_size_aligned =
+            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            16;
+
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             PipelinePolicy::template MakeADramTileDistribution<Problem>());
+
+        // A LDS tile window for store
+        auto a_copy_lds_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // A LDS tile for block GEMM
+        auto a_lds_gemm_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // Prefetch -----------------------------------------------------------
+        // global read 0
+        auto a_block_tile = load_tile(a_copy_dram_window);
+
+#if FEIFEI_DEBUG // debug A global load
+        int a_dim = a_block_tile.get_num_of_dimension();
+        int a_sz = a_block_tile.get_thread_buffer_size();
+
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[PIPELN] a_dim = %d, a_sz = %d\n", a_dim, a_sz);
+        }
+        for(auto i = 0; i < a_sz; i++)
+        {
+            dbg_f16[gid * DEBUG_CNT + i] = a_block_tile.get_thread_buffer()[i];
+        }
+        return nullptr;
+#endif
+
+        // move to 1
+        move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+
+        // LDS write 0
+        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
+        {
+            auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
+                PipelinePolicy::template MakeShuffledARegBlockDescriptor<Problem>());
+            shuffle_tile(a_shuffle_tmp, a_block_tile);
+            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
+            store_tile(a_copy_lds_window, a_block_tile_tmp);
+        }
+        else
+        {
+            store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile));
+        }
+
+        // Loop ---------------------------------------------------------------
+        // Do flatmm
+        block_flatmm(a_lds_gemm_window
+#if FEIFEI_DEBUG
+                     ,
+                     b_ptr,
+                     dbg_int,
+                     dbg_fp32,
+                     dbg_f168
+#endif
+        );
+
+        // Tail ---------------------------------------------------------------
+
        return nullptr;
 #else
        // A tile in LDS
        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);

-        constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+        constexpr auto a_lds_block_desc =
+            PipelinePolicy::template MakeALdsBlockDescriptor<Problem>();

        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);

@@ -125,7 +215,8 @@ struct FlatmmPipelineAGmemBGmemCRegV1
        BDataType* p_b_lds = static_cast<BDataType*>(
            static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));

-        constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+        constexpr auto b_lds_block_desc =
+            PipelinePolicy::template MakeBLdsBlockDescriptor<Problem>();

        auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);

@@ -134,7 +225,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
                             a_dram_block_window_tmp.get_window_origin(),
-                             Policy::template MakeADramTileDistribution<Problem>());
+                             PipelinePolicy::template MakeADramTileDistribution<Problem>());

        // A LDS tile window for store
        auto a_copy_lds_window = make_tile_window(
@@ -145,7 +236,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
                             b_dram_block_window_tmp.get_window_origin(),
-                             Policy::template MakeBDramTileDistribution<Problem>());
+                             PipelinePolicy::template MakeBDramTileDistribution<Problem>());

        // B LDS tile window for store
        auto b_copy_lds_window = make_tile_window(
@@ -184,7 +275,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
            {
                auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
-                    Policy::template MakeShuffledARegBlockDescriptor<Problem>());
+                    PipelinePolicy::template MakeShuffledARegBlockDescriptor<Problem>());
                shuffle_tile(a_shuffle_tmp, a_block_tile);
                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
                store_tile(a_copy_lds_window, a_block_tile_tmp);
@@ -198,7 +289,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
            {
                auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
-                    Policy::template MakeShuffledBRegBlockDescriptor<Problem>());
+                    PipelinePolicy::template MakeShuffledBRegBlockDescriptor<Problem>());
                shuffle_tile(b_shuffle_tmp, b_block_tile);
                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_shuffle_tmp);
                store_tile(b_copy_lds_window, b_block_tile_tmp);
@@ -235,7 +326,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
            {
                auto b_shuffle_tmp_loop = make_static_distributed_tensor<BDataType>(
-                    Policy::template MakeShuffledBRegBlockDescriptor<Problem>());
+                    PipelinePolicy::template MakeShuffledBRegBlockDescriptor<Problem>());
                shuffle_tile(b_shuffle_tmp_loop, b_block_tile);
                store_tile(b_copy_lds_window,
                           tile_elementwise_in(b_element_func, b_shuffle_tmp_loop));
@@ -271,7 +362,7 @@ struct FlatmmPipelineAGmemBGmemCRegV1
                                   const BDataType* b_ptr,
                                   int* dbg_int,
                                   float* dbg_fp32,
-                                   short* dbg_f168
+                                   void* dbg_f168
 #endif
    ) const
    {

--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
-
-namespace ck_tile {
-
-// Default policy for GemmPipelineAGmemBGmemCRegV1
-// Default policy class should not be templated, put template on member functions instead
-struct FlatmmPipelineAGmemBGmemCRegV1DefaultPolicy
-{
-    static constexpr auto I0 = number<0>{};
-    static constexpr auto I1 = number<1>{};
-    static constexpr auto I2 = number<2>{};
-
-    static constexpr bool TransposeC = true;
-
-#if 0
-    // 2d
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
-    {
-        using namespace ck_tile;
-
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr auto a_lds_block_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(kMPerBlock, kKPerBlock), number<32>{});
-
-        return a_lds_block_desc;
-    }
-
-    // 2d
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
-    {
-        using namespace ck_tile;
-
-        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr auto b_lds_block_desc =
-            make_naive_tensor_descriptor_packed(make_tuple(kNPerBlock, kKPerBlock), number<32>{});
-
-        return b_lds_block_desc;
-    }
-#elif 1
-    // 3d + padding
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
-    {
-        using namespace ck_tile;
-
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        // TODO: this 8 is AK1! should be a policy parameter!
-        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<kKPerBlock / 8>{}, number<kMPerBlock>{}, number<8>{}),
-            make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
-            number<8>{},
-            number<1>{});
-
-        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
-            a_lds_block_desc_0,
-            make_tuple(make_pass_through_transform(kMPerBlock),
-                       make_merge_transform(make_tuple(kKPerBlock / 8, 8))),
-            make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return a_lds_block_desc;
-    }
-
-    // 3d + padding
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
-    {
-        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<kKPerBlock / 8>{}, number<kNPerBlock>{}, number<8>{}),
-            make_tuple(number<(kNPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
-            number<8>{},
-            number<1>{});
-
-        constexpr auto b_lds_block_desc = transform_tensor_descriptor(
-            b_lds_block_desc_0,
-            make_tuple(make_pass_through_transform(kNPerBlock),
-                       make_merge_transform(make_tuple(kKPerBlock / 8, 8))),
-            make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return b_lds_block_desc;
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
-    {
-        constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
-                                        MakeALdsBlockDescriptor<Problem>().get_element_space_size();
-        return smem_size_a;
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
-    {
-        constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
-                                        MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
-        return smem_size_b;
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
-    {
-        constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
-        constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
-        constexpr index_t smem_size   = smem_size_a + smem_size_b;
-
-        return smem_size;
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA()
-    {
-        using ADataType = remove_cvref_t<typename Problem::ADataType>;
-        return Problem::VectorLoadSize / sizeof(ADataType);
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB()
-    {
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-        return Problem::VectorLoadSize / sizeof(BDataType);
-    }
-#elif 1
-    // fake XOR
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
-    {
-        using namespace ck_tile;
-
-        using ADataType = remove_cvref_t<typename Problem::ADataType>;
-
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr auto a_lds_block_desc_d1_d2_d3 = make_naive_tensor_descriptor_packed(
-            make_tuple(number<kMPerBlock / 2>{}, number<2>{}, number<kKPerBlock>{}),
-            number<kKPerBlock>{});
-
-        constexpr index_t kK1 = 16 / sizeof(ADataType);
-
-        constexpr auto a_lds_block_desc_d4_d5_d6 = transform_tensor_descriptor(
-            a_lds_block_desc_d1_d2_d3,
-            make_tuple(
-                make_xor_transform(make_tuple(number<kMPerBlock / 2>{}, number<kKPerBlock>{}), kK1),
-                make_pass_through_transform(2)),
-            make_tuple(sequence<0, 2>{}, sequence<1>{}),
-            make_tuple(sequence<0, 2>{}, sequence<1>{}));
-
-        constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
-            a_lds_block_desc_d4_d5_d6,
-            make_tuple(make_merge_transform(make_tuple(number<kMPerBlock / 2>{}, number<2>{})),
-                       make_pass_through_transform(kKPerBlock)),
-            make_tuple(sequence<0, 1>{}, sequence<2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return a_lds_block_desc_m_k;
-    }
-
-    // fake XOR
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
-    {
-        using namespace ck_tile;
-
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-
-        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr auto b_lds_block_desc_d1_d2_d3 = make_naive_tensor_descriptor_packed(
-            make_tuple(number<kNPerBlock / 2>{}, number<2>{}, number<kKPerBlock>{}),
-            number<kKPerBlock>{});
-
-        constexpr index_t kK1 = 16 / sizeof(BDataType);
-
-        constexpr auto b_lds_block_desc_d4_d5_d6 = transform_tensor_descriptor(
-            b_lds_block_desc_d1_d2_d3,
-            make_tuple(
-                make_xor_transform(make_tuple(number<kNPerBlock / 2>{}, number<kKPerBlock>{}), kK1),
-                make_pass_through_transform(2)),
-            make_tuple(sequence<0, 2>{}, sequence<1>{}),
-            make_tuple(sequence<0, 2>{}, sequence<1>{}));
-
-        constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
-            b_lds_block_desc_d4_d5_d6,
-            make_tuple(make_merge_transform(make_tuple(number<kNPerBlock / 2>{}, number<2>{})),
-                       make_pass_through_transform(kKPerBlock)),
-            make_tuple(sequence<0, 1>{}, sequence<2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
-
-        return b_lds_block_desc_n_k;
-    }
-#endif
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
-    {
-        using ADataType = remove_cvref_t<typename Problem::ADataType>;
-        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
-
-        constexpr index_t BlockSize = Problem::kBlockSize;
-
-        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-
-        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
-        {
-            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
-            constexpr index_t M0           = MPerBlock / M1;
-            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
-            static_assert(total_pixels % M1 == 0);
-            constexpr index_t K3    = total_pixels / M1;
-            constexpr index_t KPack = GetSmemPackA<Problem>();
-            static_assert(KPack % K3 == 0);
-            constexpr index_t K2 = KPack / K3;
-            if constexpr(get_warp_size() % (K2 * M0))
-            {
-                constexpr index_t K1 = get_warp_size() / (K2 * M0);
-                constexpr index_t K0 = BlockSize / get_warp_size();
-                static_assert(KPerBlock == K0 * K1 * K2 * K3);
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
-                                               tuple<sequence<2>, sequence<2, 1, 2>>,
-                                               tuple<sequence<0>, sequence<1, 0, 2>>,
-                                               sequence<2, 1>,
-                                               sequence<3, 1>>{});
-            }
-            else
-            {
-                constexpr index_t K1   = (K2 * M0) / get_warp_size();
-                constexpr index_t K2_m = K2 / K1;
-                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
-                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
-                                               tuple<sequence<2, 2>, sequence<1, 2>>,
-                                               tuple<sequence<0, 1>, sequence<0, 2>>,
-                                               sequence<2, 1>,
-                                               sequence<3, 1>>{});
-            }
-        }
-        else
-        {
-            constexpr index_t K1 = 16 / sizeof(ADataType);
-            constexpr index_t K0 = KPerBlock / K1;
-            constexpr index_t M2 = get_warp_size() / K0;
-            // coalesce reading for each blocks
-            if constexpr(get_warp_size() % (M2 * K0) == 0)
-            {
-                constexpr index_t M1 = BlockSize / get_warp_size();
-                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
-                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
-                constexpr index_t M0 = MPerBlock / (M2 * M1);
-                static_assert(M0 * M1 * M2 == MPerBlock,
-                              "Incorrect M0, M2, M1 configuration! "
-                              "M0, M1, M2 must cover whole MPerBlock!");
-
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
-                                               tuple<sequence<1>, sequence<1, 2>>,
-                                               tuple<sequence<1>, sequence<2, 0>>,
-                                               sequence<1, 2>,
-                                               sequence<0, 1>>{});
-            }
-            else
-            {
-                constexpr index_t M0 = BlockSize / get_warp_size();
-                constexpr index_t M1 = MPerBlock / (M2 * M0);
-                static_assert(M0 * M1 * M2 == MPerBlock,
-                              "Incorrect M0, M1, M2 configuration! "
-                              "M0, M1, M2 must cover whole MPerBlock!");
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
-                                               tuple<sequence<1>, sequence<1, 2>>,
-                                               tuple<sequence<0>, sequence<2, 0>>,
-                                               sequence<1, 2>,
-                                               sequence<1, 1>>{});
-            }
-        }
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
-    {
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
-
-        constexpr index_t BlockSize = Problem::kBlockSize;
-
-        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-
-        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
-        {
-            constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
-            constexpr index_t N0           = NPerBlock / N1;
-            constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
-            static_assert(total_pixels % N1 == 0);
-            constexpr index_t K3    = total_pixels / N1;
-            constexpr index_t KPack = GetSmemPackB<Problem>();
-            static_assert(KPack % K3 == 0);
-            constexpr index_t K2 = KPack / K3;
-            if constexpr(get_warp_size() % (K2 * N0) == 0)
-            {
-                constexpr index_t K1 = get_warp_size() / (K2 * N0);
-                constexpr index_t K0 = BlockSize / get_warp_size();
-                static_assert(KPerBlock == K0 * K1 * K2 * K3);
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
-                                               tuple<sequence<2>, sequence<2, 1, 2>>,
-                                               tuple<sequence<0>, sequence<1, 0, 2>>,
-                                               sequence<2, 1>,
-                                               sequence<3, 1>>{});
-            }
-            else
-            {
-                constexpr index_t K1   = (K2 * N0) / get_warp_size();
-                constexpr index_t K2_m = K2 / K1;
-                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
-                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
-                                               tuple<sequence<2, 2>, sequence<1, 2>>,
-                                               tuple<sequence<0, 1>, sequence<0, 2>>,
-                                               sequence<2, 1>,
-                                               sequence<3, 1>>{});
-            }
-        }
-        else
-        {
-
-            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType);
-            constexpr index_t K0 = KPerBlock / K1;
-            constexpr index_t N2 = get_warp_size() / K0;
-            // coalesce reading for each blocks
-            if constexpr(get_warp_size() % (N2 * K0) == 0)
-            {
-                constexpr index_t N1 = BlockSize / get_warp_size();
-                static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
-                static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
-                constexpr index_t N0 = NPerBlock / (N2 * N1);
-                static_assert(N0 * N1 * N2 == NPerBlock,
-                              "Incorrect N0, N1, N2 configuration! "
-                              "N0, N1, N2 must cover whole NPerBlock!");
-
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
-                                               tuple<sequence<1>, sequence<1, 2>>,
-                                               tuple<sequence<1>, sequence<2, 0>>,
-                                               sequence<1, 2>,
-                                               sequence<0, 1>>{});
-            }
-            // coalesce reading for each warps
-            else
-            {
-                constexpr index_t N0 = BlockSize / get_warp_size();
-                constexpr index_t N1 = NPerBlock / (N2 * N0);
-                static_assert(N0 * N1 * N2 == NPerBlock,
-                              "Incorrect N0, N1, N2 configuration! "
-                              "N0, N1, N2 must cover whole NPerBlock!");
-                return make_static_tile_distribution(
-                    tile_distribution_encoding<sequence<1>,
-                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
-                                               tuple<sequence<1>, sequence<1, 2>>,
-                                               tuple<sequence<0>, sequence<2, 0>>,
-                                               sequence<1, 2>,
-                                               sequence<1, 1>>{});
-            }
-        }
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor()
-    {
-        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-        static_assert(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>);
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
-        constexpr index_t N0           = kNPerBlock / N1;
-        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
-        static_assert(total_pixels % N1 == 0);
-        constexpr index_t K3     = total_pixels / N1;
-        constexpr index_t kKPack = GetSmemPackB<Problem>();
-        static_assert(kKPack % K3 == 0);
-        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
-        constexpr index_t warp_size = get_warp_size();
-        if constexpr(warp_size % (K2 * N0) == 0)
-        {
-            constexpr index_t K1 = warp_size / (K2 * N0);
-            constexpr index_t K0 = kBlockSize / warp_size;
-
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<1>,
-                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
-                                           tuple<sequence<2>, sequence<2, 1, 2>>,
-                                           tuple<sequence<0>, sequence<1, 0, 2>>,
-                                           sequence<1, 2>,
-                                           sequence<1, 3>>{});
-        }
-        else
-        {
-            constexpr index_t K1   = (K2 * N0) / get_warp_size();
-            constexpr index_t K2_m = K2 / K1;
-            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
-            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<1>,
-                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
-                                           tuple<sequence<2, 2>, sequence<1, 2>>,
-                                           tuple<sequence<0, 1>, sequence<0, 2>>,
-                                           sequence<1, 2>,
-                                           sequence<1, 3>>{});
-        }
-    }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor()
-    {
-        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
-        using ADataType = remove_cvref_t<typename Problem::ADataType>;
-        static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>);
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
-        constexpr index_t M0           = kMPerBlock / M1;
-        constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize;
-        static_assert(total_pixels % M1 == 0);
-        constexpr index_t K3     = total_pixels / M1;
-        constexpr index_t kKPack = GetSmemPackA<Problem>();
-        static_assert(kKPack % K3 == 0);
-        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
-        constexpr index_t warp_size = get_warp_size();
-        if constexpr(warp_size % (K2 * M0) == 0)
-        {
-            constexpr index_t K1 = warp_size / (K2 * M0);
-            constexpr index_t K0 = kBlockSize / warp_size;
-
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<1>,
-                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
-                                           tuple<sequence<2>, sequence<2, 1, 2>>,
-                                           tuple<sequence<0>, sequence<1, 0, 2>>,
-                                           sequence<1, 2>,
-                                           sequence<1, 3>>{});
-        }
-        else
-        {
-            constexpr index_t K1   = (K2 * M0) / get_warp_size();
-            constexpr index_t K2_m = K2 / K1;
-            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
-            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
-            return make_static_tile_distribution(
-                tile_distribution_encoding<sequence<1>,
-                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
-                                           tuple<sequence<2, 2>, sequence<1, 2>>,
-                                           tuple<sequence<0, 1>, sequence<0, 2>>,
-                                           sequence<1, 2>,
-                                           sequence<1, 3>>{});
-        }
-    }
-
-    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }
-
-    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
-    {
-        using AccDataType     = float;
-        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
-        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                       typename Problem::BDataType,
-                                                       AccDataType,
-                                                       WarpTile::at(I0),
-                                                       WarpTile::at(I1),
-                                                       WarpTile::at(I2),
-                                                       TransposeC>;
-        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
-                                                                      typename Problem::BDataType,
-                                                                      typename Problem::CDataType,
-                                                                      BlockWarps,
-                                                                      WarpGemm>;
-
-        return BlockUniversalGemmAsBsCr<Problem, BlockGemmPolicy>{};
-    }
-};
-
-} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -446,24 +446,25 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }

    template <typename Problem>
-    CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockFlatmm()
    {
-        using AccDataType     = float;
-        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
-        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                       typename Problem::BDataType,
-                                                       AccDataType,
-                                                       WarpTile::at(I0),
-                                                       WarpTile::at(I1),
-                                                       WarpTile::at(I2),
-                                                       TransposeC>;
-        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
-                                                                      typename Problem::BDataType,
-                                                                      typename Problem::CDataType,
-                                                                      BlockWarps,
-                                                                      WarpGemm>;
-        return BlockGemmASmemBSmemCRegV1<Problem, BlockGemmPolicy>{};
+        using AccDataType = float;
+        using BlockWarps  = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile    = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm    = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                   typename Problem::BDataType,
+                                                   AccDataType,
+                                                   WarpTile::at(I0),
+                                                   WarpTile::at(I1),
+                                                   WarpTile::at(I2),
+                                                   TransposeC>;
+        using BlockFlatmmPolicy =
+            BlockFlatmmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                    typename Problem::BDataType,
+                                                    typename Problem::CDataType,
+                                                    BlockWarps,
+                                                    WarpGemm>;
+        return BlockFlatmmASmemBSmemCRegV1<Problem, BlockFlatmmPolicy>{};
    }
 };