[flatmm] implement framwork

839cf897 · feifei14119 · 0e5e29c4 · 839cf897 · 839cf897 · 839cf897
Commit 839cf897 authored Feb 08, 2025 by feifei14119
11 changed files
--- a/example/ck_tile/18_flatmm/CMakeLists.txt
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
+add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
+
+set(EXAMPLE_FLATMM_COMPILE_OPTIONS)
+list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter)
+list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-local-typedef)
+#list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -ggdb -g -O0 -v -save-temps)
+list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DFEIFEI_DEBUG=1)
+target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
--- a/example/ck_tile/18_flatmm/README.md
+++ b/example/ck_tile/18_flatmm/README.md
+# FLATMM Matrix Multiplication
+
+This folder contains example for FLATMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile FLATMM, but creates the placeholders for the future support on different FLATMM pipeline and different FLATMM modules. In the near future, we will gradually migrate all the FLATMM features from old CK to CK Tile.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the flatmm calculation
+make tile_example_flatmm_basic -j
+```
+This will result in an executable `build/bin/tile_example_flatmm_basic`
+
+## example
+```
+args:
+          -b    batch size (default:1)
+          -m    m dimension (default:1024)
+          -n    n dimension (default:2048)
+          -k    k dimension (default:64)
+   -a_layout    Tensor A data layout (default: R)
+   -b_layout    Tensor B data layout (default: R)
+   -c_layout    Tensor C data layout (default: R)
+   -stride_a    Tensor A stride (default:0)
+   -stride_b    Tensor B stride (default:0)
+   -stride_c    Tensor C stride (default:0)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -e    Absolute error tolerance (default:1e-5)
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+     -warmup    number of iterations before benchmark the kernel (default:10)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+```
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "flatmm_basic.hpp"
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s)
+{
+    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
+    constexpr bool kTilePermute = false;
+    // The rank and permutation will also be generate out by the CodeGen part.
+    constexpr ck_tile::index_t kOutputRank = 2;
+
+    constexpr int kBlockPerCu = 1;
+
+    // This part comes from the Codegen
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 128;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    // Whether doing the CShuffle (transpose before the global memory), depending on the output
+    // layout.
+    constexpr bool CShuffleEpilogue =
+        std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+    using CodegenGemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::GemmTile2DPartitioner<CodegenGemmShape>;
+
+    using CodegenGemmTraits =
+        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+    using CodegenPipelineProblem = ck_tile::
+        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
+    using CodegenFlatmmPolicy = ck_tile::UniversalFlatmmPipelineAgBgCrPolicy;
+
+    using GemmEpilogue = ck_tile::CShuffleEpilogue<
+        ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                         CDataType,
+                                         CLayout,
+                                         CodegenPipelineProblem::kBlockSize,
+                                         TilePartitioner::MPerBlock,
+                                         TilePartitioner::NPerBlock,
+                                         M_Warp,
+                                         N_Warp,
+                                         M_Warp_Tile,
+                                         N_Warp_Tile,
+                                         K_Warp_Tile,
+                                         CodegenPipelineProblem::TransposeC>>;
+
+    using CodegenFlatmmPipeline =
+        ck_tile::FlatmmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenFlatmmPolicy>;
+
+    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+    using Kernel = ck_tile::FlatmmKernel<TilePartitioner, CodegenFlatmmPipeline, GemmEpilogue>;
+
+    auto kargs = Kernel::MakeKernelArgs(args);
+
+    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+    constexpr dim3 blocks = Kernel::BlockSize();
+
+#if FEIFEI_DEBUG
+    printf("[FEIFEI] --- flatmm_calc() ---\n");
+    printf("[FEIFEI] BlockPerCu = %d\n", static_cast<int>(kBlockPerCu));
+    printf("[FEIFEI] BlockTile M = %d\n", static_cast<int>(M_Tile));
+    printf("[FEIFEI] BlockTile N = %d\n", static_cast<int>(N_Tile));
+    printf("[FEIFEI] BlockTile K = %d\n", static_cast<int>(K_Tile));
+    printf("[FEIFEI] WavePerBlock M = %d\n", static_cast<int>(M_Warp));
+    printf("[FEIFEI] WavePerBlock N = %d\n", static_cast<int>(N_Warp));
+    printf("[FEIFEI] WavePerBlock K = %d\n", static_cast<int>(K_Warp));
+    printf("[FEIFEI] WaveTile M = %d\n", static_cast<int>(M_Warp_Tile));
+    printf("[FEIFEI] WaveTile N = %d\n", static_cast<int>(N_Warp_Tile));
+    printf("[FEIFEI] WaveTile K = %d\n", static_cast<int>(K_Warp_Tile));
+    printf("[FEIFEI] grids = [%d, %d, %d]\n", grids.x, grids.y, grids.z);
+    printf("[FEIFEI] blocks = [%d, %d, %d]\n", blocks.x, blocks.y, blocks.z);
+#endif
+
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+    }
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Launching kernel with args:"
+                  << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
+    }
+
+    float ave_time = ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+    return ave_time;
+}
+
+#include "run_flatmm_example.inc"
+
+int main(int argc, char* argv[]) { return !run_flatmm_example(argc, argv); }
--- a/example/ck_tile/18_flatmm/flatmm_basic.hpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/flatmm.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE 1
+#define CK_TILE_PIPELINE_MEMORY 2
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE
+#endif
+
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#else
+#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+#endif
+
+template <typename DataType>
+struct GemmBasicTypeConfig;
+
+template <>
+struct GemmBasicTypeConfig<ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+    // ToDo: Add more bias config to support different categories of GEMM.
+};
+
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+using Types = GemmBasicTypeConfig<ck_tile::half_t>;
+
+// Specific type aliases for easy access
+using ADataType   = Types::ADataType;
+using BDataType   = Types::BDataType;
+using AccDataType = Types::AccDataType;
+using CDataType   = Types::CDataType;
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser
+        .insert("m", "128", "m dimension") // 128, 3840
+        .insert("n", "128", "n dimension") // 128, 4096
+        .insert("k", "64", "k dimension")  // 64,  2048
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "R", "B tensor data layout - Row by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+// mfma_type, 0:32x32, 1:16x16
+template <typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t, std::string mfma_dtype, int mfma_type = 0)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_ = t.get_lengths()[0];
+    int k_ = t.get_lengths()[1];
+    printf("[FF] shuffle_b: mfma_dtype = %s, mfma_type = %d, n_ = %d, k_ = %d\n",
+           mfma_dtype.c_str(),
+           mfma_type,
+           n_,
+           k_);
+    if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 0)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 32, 32, k_ / 16, 2, 8});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    else if((mfma_dtype == "bf16" || mfma_dtype == "fp16") && mfma_type == 1)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 16, 16, k_ / 32, 4, 8});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 0)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 32, 32, k_ / 32, 2, 16});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    else if((mfma_dtype == "int8" || mfma_dtype == "fp8") && mfma_type == 1)
+    {
+        ck_tile::HostTensor<T> t_view({n_ / 16, 16, k_ / 64, 4, 16});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
+    return t;
+}
+
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+float invoke_flatmm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                    ck_tile::DeviceMem& b_shuffle_dev_buf,
+                    ck_tile::DeviceMem& c_m_n_dev_buf,
+                    ck_tile::index_t M,
+                    ck_tile::index_t N,
+                    ck_tile::index_t K,
+                    ck_tile::index_t stride_A,
+                    ck_tile::index_t stride_B,
+                    ck_tile::index_t stride_C,
+                    ck_tile::index_t kbatch,
+                    int n_warmup,
+                    int n_repeat
+#if FEIFEI_DEBUG
+                    ,
+                    ck_tile::DeviceMem& b_k_n_dev_buf,
+                    ck_tile::DeviceMem& dbg_int_buf,
+                    ck_tile::DeviceMem& dbg_fp32_buf,
+                    ck_tile::DeviceMem& dbg_f168_buf
+#endif
+)
+{
+    ck_tile::FlatmmHostArgs args;
+    args.a_ptr         = a_m_k_dev_buf.GetDeviceBuffer();
+    args.b_shuffle_ptr = b_shuffle_dev_buf.GetDeviceBuffer();
+    args.c_ptr         = c_m_n_dev_buf.GetDeviceBuffer();
+
+    args.k_batch  = kbatch;
+    args.M        = M;
+    args.N        = N;
+    args.K        = K;
+    args.stride_A = stride_A;
+    args.stride_B = stride_B;
+    args.stride_C = stride_C;
+
+#if FEIFEI_DEBUG
+    args.b_ptr        = b_k_n_dev_buf.GetDeviceBuffer();
+    args.dbg_int_ptr  = dbg_int_buf.GetDeviceBuffer();
+    args.dbg_fp32_ptr = dbg_fp32_buf.GetDeviceBuffer();
+    args.dbg_f168_ptr = dbg_f168_buf.GetDeviceBuffer();
+
+    printf("[FEIFEI] --- invoke_flatmm: ---\n");
+    printf("[FEIFEI] args.M = %d\n", static_cast<int>(args.M));
+    printf("[FEIFEI] args.N = %d\n", static_cast<int>(args.N));
+    printf("[FEIFEI] args.K = %d\n", static_cast<int>(args.K));
+    printf("[FEIFEI] args.stride_A = %d\n", static_cast<int>(args.stride_A));
+    printf("[FEIFEI] args.stride_B = %d\n", static_cast<int>(args.stride_B));
+    printf("[FEIFEI] args.stride_C = %d\n", static_cast<int>(args.stride_C));
+    printf("[FEIFEI] args.k_batch = %d\n", static_cast<int>(args.k_batch));
+#endif
+
+    float ave_time = flatmm_calc<ALayout, BLayout, CLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Flatmm kernel with M =" << M << " N =" << N << " K =" << K
+              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
+              << " : " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <typename ALayout, typename BLayout, typename CLayout>
+int run_flatmm_example_with_layouts(int argc,
+                                    char* argv[],
+                                    const ALayout a_layout                  = ALayout{},
+                                    const BLayout b_layout                  = BLayout{},
+                                    [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
+    int n_warmup            = arg_parser.get_int("warmup");
+    int n_repeat            = arg_parser.get_int("repeat");
+#if FEIFEI_DEBUG
+    n_warmup = 1;
+    n_repeat = 2;
+#endif
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    // TODO: add different init types
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+#if FEIFEI_DEBUG
+    ck_tile::HostTensor<int> dbg_int({M * N * 64});
+    ck_tile::HostTensor<float> dbg_fp32({M * N * 64});
+    ck_tile::HostTensor<ADataType> dbg_f168({M * N * 64});
+
+    ck_tile::DeviceMem dbg_int_buf(dbg_int.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dbg_fp32_buf(dbg_fp32.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem dbg_f168_buf(dbg_f168.get_element_space_size_in_bytes());
+#endif
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    // do pre-shuffle
+    std::string mfma                              = arg_parser.get_str("prec");
+    ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b(b_k_n, mfma, 1);
+    ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
+    b_shuffle_dev_buf.ToDevice(b_shuffle_host.data());
+
+    invoke_flatmm<ALayout, BLayout, CLayout>(a_m_k_dev_buf,
+                                             b_shuffle_dev_buf,
+                                             c_m_n_dev_buf,
+                                             M,
+                                             N,
+                                             K,
+                                             stride_A,
+                                             stride_B,
+                                             stride_C,
+                                             kbatch,
+                                             n_warmup,
+                                             n_repeat
+#if FEIFEI_DEBUG
+                                             ,
+                                             b_k_n_dev_buf,
+                                             dbg_int_buf,
+                                             dbg_fp32_buf,
+                                             dbg_f168_buf
+#endif
+    );
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_ref);
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value);
+        pass                 = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+#if FEIFEI_DEBUG
+        // c_ref
+        {
+            std::ofstream file("ff_c_cpu_ref.txt");
+            int X = static_cast<int>(N);
+            int Y = static_cast<int>(M);
+            file << " [c_cpu_ref]: Row = " << Y << ", Col = " << X << std::endl;
+
+            for(int y = 0; y < Y; y++)
+            {
+                file << "\n ========== row : [" << y << " / " << Y << "] ==========";
+                for(int x = 0; x < X; x++)
+                {
+                    if(x % 64 == 0)
+                    {
+                        file << "\n [" << x << " : " << x + 63 << "]: ";
+                    }
+                    int idx = X * y + x;
+                    file << ck_tile::type_convert<float>(c_m_n_host_ref.mData[idx]) << ", ";
+                }
+            }
+
+            file.close();
+        }
+#endif
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_ref.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A;
+        BDataType* d_B;
+        CDataType* d_C;
+
+        ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType)));
+
+        ck_tile::hip_check_error(hipMemcpy(d_A,
+                                           a_m_k_dev_buf.GetDeviceBuffer(),
+                                           M * K * sizeof(ADataType),
+                                           hipMemcpyHostToDevice));
+        ck_tile::hip_check_error(hipMemcpy(d_B,
+                                           b_k_n_dev_buf.GetDeviceBuffer(),
+                                           N * K * sizeof(BDataType),
+                                           hipMemcpyHostToDevice));
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
+                                           d_C,
+                                           M * N * sizeof(CDataType),
+                                           hipMemcpyDeviceToHost));
+
+        ck_tile::hip_check_error(hipFree(d_A));
+        ck_tile::hip_check_error(hipFree(d_B));
+        ck_tile::hip_check_error(hipFree(d_C));
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol(K, kbatch, max_accumulated_value);
+        pass                 = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_gpu_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+#if FEIFEI_DEBUG
+        // c_ref
+        {
+            std::ofstream file("ff_c_gpu_ref.txt");
+            int X = static_cast<int>(N);
+            int Y = static_cast<int>(M);
+            file << " [c_gpu_ref]: Row = " << Y << ", Col = " << X << std::endl;
+
+            for(int y = 0; y < Y; y++)
+            {
+                file << "\n ========== row : [" << y << " / " << Y << "] ==========";
+                for(int x = 0; x < X; x++)
+                {
+                    if(x % 64 == 0)
+                    {
+                        file << "\n [" << x << " : " << x + 63 << "]: ";
+                    }
+                    int idx = X * y + x;
+                    file << ck_tile::type_convert<float>(c_m_n_gpu_ref.mData[idx]) << ", ";
+                }
+            }
+
+            file.close();
+        }
+#endif
+    }
+
+#if FEIFEI_DEBUG
+    int GridDimX  = 1;
+    int GridDimY  = 1;
+    int BlockDimX = 64;
+    int BlockDimY = 4;
+    int DbgCnt    = 64;
+    int BlockSize = BlockDimX * BlockDimY;
+    // a_host
+    {
+        std::ofstream file("ff_a_host.txt");
+        int X = static_cast<int>(K);
+        int Y = static_cast<int>(M);
+        file << " [a_host]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int y = 0; y < Y; y++)
+        {
+            file << "\n ========== row : [" << y << " / " << Y << "] ==========";
+            for(int x = 0; x < X; x++)
+            {
+                int idx = X * y + x;
+                if(idx % 16 == 0)
+                {
+                    file << "\n [" << x << " : " << x + 15 << " ]: ";
+                }
+
+                file << ck_tile::type_convert<float>(a_m_k.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // b_host
+    {
+        std::ofstream file("ff_b_host.txt");
+        int X = static_cast<int>(K);
+        int Y = static_cast<int>(N);
+        file << " [b_host]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int y = 0; y < Y; y++)
+        {
+            file << "\n ========== row : [" << y << " / " << Y << "] ==========";
+            for(int x = 0; x < X; x++)
+            {
+                int idx = X * y + x;
+                if(idx % 16 == 0)
+                {
+                    file << "\n [" << x << " : " << x + 15 << " ]: ";
+                }
+
+                file << ck_tile::type_convert<float>(b_k_n.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // b_shuffle
+    {
+        std::ofstream file("ff_b_shuffle_host.txt");
+        int X = static_cast<int>(K);
+        int Y = static_cast<int>(N);
+        file << " [b_shuffle_host]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int y = 0; y < Y; y++)
+        {
+            file << "\n ========== row : [" << y << " / " << Y << "] ==========";
+            for(int x = 0; x < X; x++)
+            {
+                int idx = X * y + x;
+                if(idx % 16 == 0)
+                {
+                    file << "\n [" << x << " : " << x + 15 << " ]: ";
+                }
+
+                file << ck_tile::type_convert<float>(b_shuffle_host.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // c_dev ---> kernel
+    {
+        auto c_dev = c_m_n_dev_buf.ToHost<CDataType>();
+        std::ofstream file("ff_c_dev_kernel.txt");
+        file << " [c_dev]: Grid = [" << GridDimX << ", " << GridDimY << "], Block = " << BlockSize
+             << std::endl;
+
+        for(int bidy = 0; bidy < GridDimY; bidy++)
+        {
+            for(int bidx = 0; bidx < GridDimX; bidx++)
+            {
+                file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
+                for(int tid = 0; tid < BlockSize; tid++)
+                {
+                    int gid = (BlockSize * GridDimX) * bidy + BlockSize * bidx + tid;
+
+                    file << "\n [" << tid << "]: ";
+                    for(int i = 0; i < DbgCnt; i++) // multi output per thread
+                        file << ck_tile::type_convert<float>(c_dev.mData[gid * DbgCnt + i]) << ", ";
+                }
+            }
+        }
+
+        file.close();
+    }
+    // c_dev
+    {
+        // auto d_dev = d_buf.ToHost<float>();
+        auto c_dev = c_m_n_dev_buf.ToHost<CDataType>();
+        std::ofstream file("ff_c_dev.txt");
+        int X = static_cast<int>(N);
+        int Y = static_cast<int>(M);
+        file << " [c_dev]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int y = 0; y < Y; y++)
+        {
+            file << "\n ========== row : [" << y << " / " << Y << "] ==========";
+            for(int x = 0; x < X; x++)
+            {
+                if(x % 64 == 0)
+                {
+                    file << "\n [" << x << " : " << x + 63 << "]: ";
+                }
+                int idx = X * y + x;
+                file << ck_tile::type_convert<float>(c_dev.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // dbg_int ---> kernel
+    {
+        auto dbg_int_dev = dbg_int_buf.ToHost<int>();
+        std::ofstream file("ff_dbg_int_kernel.txt");
+        file << " [dbg_int]: Grid = [" << GridDimX << ", " << GridDimY << "], Block = " << BlockSize
+             << std::endl;
+
+        for(int bidy = 0; bidy < GridDimY; bidy++)
+        {
+            for(int bidx = 0; bidx < GridDimX; bidx++)
+            {
+                file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
+                for(int tid = 0; tid < BlockSize; tid++)
+                {
+                    int gid = (BlockSize * GridDimX) * bidy + BlockSize * bidx + tid;
+
+                    file << "\n [" << tid << "]: ";
+                    for(int i = 0; i < DbgCnt; i++)
+                        file << ck_tile::type_convert<int>(dbg_int_dev.mData[gid * DbgCnt + i])
+                             << ", ";
+                }
+            }
+        }
+
+        file.close();
+    }
+    // dbg_int
+    {
+        auto dbg_int_dev = dbg_int_buf.ToHost<int>();
+        std::ofstream file("ff_dbg_int.txt");
+        int X = static_cast<int>(N);
+        int Y = static_cast<int>(M);
+        file << " [dbg_int]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int m = 0; m < Y; m++)
+        {
+            file << "\n ========== row : [" << m << " / " << Y << "] ==========";
+            for(int n = 0; n < X; n++)
+            {
+                if(n % 64 == 0)
+                {
+                    file << "\n [" << n << " : " << n + 63 << "]: ";
+                }
+                int idx = X * m + n;
+                file << ck_tile::type_convert<int>(dbg_int_dev.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // dbg_fp32 ---> kernel
+    {
+        auto dbg_fp32_dev = dbg_fp32_buf.ToHost<float>();
+        std::ofstream file("ff_dbg_fp32_kernel.txt");
+        file << " [dbg_fp32]: Grid = [" << GridDimX << ", " << GridDimY
+             << "], Block = " << BlockSize << std::endl;
+
+        for(int bidy = 0; bidy < GridDimY; bidy++)
+        {
+            for(int bidx = 0; bidx < GridDimX; bidx++)
+            {
+                file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
+                for(int tid = 0; tid < BlockSize; tid++)
+                {
+                    int gid = (BlockSize * GridDimX) * bidy + BlockSize * bidx + tid;
+
+                    file << "\n [" << tid << "]: ";
+                    for(int i = 0; i < DbgCnt; i++)
+                        file << ck_tile::type_convert<float>(dbg_fp32_dev.mData[gid * DbgCnt + i])
+                             << ", ";
+                }
+            }
+        }
+
+        file.close();
+    }
+    // dbg_fp32
+    {
+        auto dbg_fp32_dev = dbg_fp32_buf.ToHost<float>();
+        std::ofstream file("ff_dbg_fp32.txt");
+        int X = static_cast<int>(N);
+        int Y = static_cast<int>(M);
+        file << " [dbg_fp32]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int m = 0; m < Y; m++)
+        {
+            file << "\n ========== row : [" << m << " / " << Y << "] ==========";
+            for(int n = 0; n < X; n++)
+            {
+                if(n % 64 == 0)
+                {
+                    file << "\n [" << n << " : " << n + 63 << "]: ";
+                }
+                int idx = X * m + n;
+                file << ck_tile::type_convert<float>(dbg_fp32_dev.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // dbg_fp16 ---> kernel
+    {
+        auto dbg_fp16_dev = dbg_f168_buf.ToHost<ck_tile::half_t>();
+        std::ofstream file("ff_dbg_fp16_kernel.txt");
+        file << " [dbg_fp16]: Grid = [" << GridDimX << ", " << GridDimY
+             << "], Block = " << BlockSize << std::endl;
+
+        for(int bidy = 0; bidy < GridDimY; bidy++)
+        {
+            for(int bidx = 0; bidx < GridDimX; bidx++)
+            {
+                file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
+                for(int tid = 0; tid < BlockSize; tid++)
+                {
+                    int gid = (BlockSize * GridDimX) * bidy + BlockSize * bidx + tid;
+
+                    file << "\n [" << tid << "]: ";
+                    for(int i = 0; i < DbgCnt; i++)
+                        file << ck_tile::type_convert<float>(dbg_fp16_dev.mData[gid * DbgCnt + i])
+                             << ", ";
+                }
+            }
+        }
+
+        file.close();
+    }
+    // dbg_fp16
+    {
+        auto dbg_fp16_dev = dbg_f168_buf.ToHost<ck_tile::half_t>();
+        std::ofstream file("ff_dbg_fp16.txt");
+        int X = static_cast<int>(N);
+        int Y = static_cast<int>(M);
+        file << " [dbg_fp16]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int m = 0; m < Y; m++)
+        {
+            file << "\n ========== row : [" << m << " / " << Y << "] ==========";
+            for(int n = 0; n < X; n++)
+            {
+                if(n % 64 == 0)
+                {
+                    file << "\n [" << n << " : " << n + 63 << "]: ";
+                }
+                int idx = X * m + n;
+                file << ck_tile::type_convert<float>(dbg_fp16_dev.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+    // dbg_fp8 ---> kernel
+    {
+        auto dbg_fp8_dev = dbg_f168_buf.ToHost<ck_tile::fp8_t>();
+        std::ofstream file("ff_dbg_fp8_kernel.txt");
+        file << " [dbg_fp8]: Grid = [" << GridDimX << ", " << GridDimY << "], Block = " << BlockSize
+             << std::endl;
+
+        for(int bidy = 0; bidy < GridDimY; bidy++)
+        {
+            for(int bidx = 0; bidx < GridDimX; bidx++)
+            {
+                file << "\n ========== block : [" << bidx << ", " << bidy << "] ==========";
+                for(int tid = 0; tid < BlockSize; tid++)
+                {
+                    int gid = (BlockSize * GridDimX) * bidy + BlockSize * bidx + tid;
+
+                    file << "\n [" << tid << "]: ";
+                    for(int i = 0; i < DbgCnt; i++)
+                        file << ck_tile::type_convert<float>(dbg_fp8_dev.mData[gid * DbgCnt + i])
+                             << ", ";
+                }
+            }
+        }
+
+        file.close();
+    }
+    // dbg_fp8
+    {
+        auto dbg_fp8_dev = dbg_f168_buf.ToHost<ck_tile::fp8_t>();
+        std::ofstream file("ff_dbg_fp8.txt");
+        int X = static_cast<int>(N);
+        int Y = static_cast<int>(M);
+        file << " [dbg_fp8]: Row = " << Y << ", Col = " << X << std::endl;
+
+        for(int m = 0; m < Y; m++)
+        {
+            file << "\n ========== row : [" << m << " / " << Y << "] ==========";
+            for(int n = 0; n < X; n++)
+            {
+                if(n % 64 == 0)
+                {
+                    file << "\n [" << n << " : " << n + 63 << "]: ";
+                }
+                int idx = X * m + n;
+                file << ck_tile::type_convert<float>(dbg_fp8_dev.mData[idx]) << ", ";
+            }
+        }
+
+        file.close();
+    }
+#endif
+
+    return pass;
+}
+
+int run_flatmm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    std::string a_layout = arg_parser.get_str("a_layout");
+    std::string b_layout = arg_parser.get_str("b_layout");
+
+    if(a_layout == "R" && b_layout == "R")
+    {
+        return run_flatmm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "C")
+    {
+        return run_flatmm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+    }
+    // TODO: Fixme: with latest changes to GemmPipelineAGmemBGmemCRegV1DefaultPolicy below do not
+    // work.
+    // else if(a_layout == "C" && b_layout == "C")
+    // {
+    //     return run_flatmm_example_with_layouts(argc, argv, Col{}, Col{}, Row{});
+    // }
+    // else if(a_layout == "C" && b_layout == "R")
+    // {
+    //     return run_flatmm_example_with_layouts(argc, argv, Col{}, Row{}, Row{});
+    // }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -17,4 +17,5 @@ add_subdirectory(14_moe_smoothquant)
 add_subdirectory(15_fused_moe)
 add_subdirectory(16_batched_gemm)
 add_subdirectory(17_grouped_gemm)
+add_subdirectory(18_flatmm)
 add_subdirectory(35_batched_transpose)
--- a/include/ck_tile/ops/flatmm.hpp
+++ b/include/ck_tile/ops/flatmm.hpp
@@ -3,6 +3,53 @@

 #pragma once

+#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
+#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
+#include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
+// #include "ck_tile/ops/gemm/kernel/gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+// #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
+// #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp"
+// #include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
+#include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
+#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
+
+#include "ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp"
+
 #include "ck_tile/ops/flatmm/block/flatmm_32x512x128_1x4x1_16x16x32.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32.hpp"
 #include "ck_tile/ops/flatmm/block/flatmm_sn_32x128x512_1x4x1_16x16x32_itl.hpp"

--- a/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/flatmm_kernel.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/common.hpp"
+#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+
+namespace ck_tile {
+
+struct FlatmmProblem
+{
+    CK_TILE_HOST FlatmmProblem() = default;
+    CK_TILE_HOST FlatmmProblem(
+        index_t M_, index_t N_, index_t K_, index_t stride_A_, index_t stride_B_, index_t stride_C_)
+        : M(M_), N(N_), K(K_), stride_A(stride_A_), stride_B(stride_B_), stride_C(stride_C_)
+    {
+    }
+
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+};
+
+struct FlatmmHostArgs : public FlatmmProblem
+{
+    CK_TILE_HOST FlatmmHostArgs() = default;
+    CK_TILE_HOST FlatmmHostArgs(const void* a_ptr_,
+                                const void* b_shuffle_ptr_,
+                                void* c_ptr_,
+                                index_t k_batch_,
+                                index_t M_,
+                                index_t N_,
+                                index_t K_,
+                                index_t stride_A_,
+                                index_t stride_B_,
+                                index_t stride_C_)
+        : FlatmmProblem(M_, N_, K_, stride_A_, stride_B_, stride_C_),
+          a_ptr(a_ptr_),
+          b_shuffle_ptr(b_shuffle_ptr_),
+          c_ptr(c_ptr_),
+          k_batch(k_batch_)
+    {
+    }
+
+    const void* a_ptr;
+    const void* b_shuffle_ptr;
+    void* c_ptr;
+    index_t k_batch;
+
+#if FEIFEI_DEBUG
+    const void* b_ptr;
+
+    void* dbg_int_ptr;
+    void* dbg_fp32_ptr;
+    void* dbg_f168_ptr;
+#endif
+};
+
+template <typename TilePartitioner_, typename FlatmmPipeline_, typename EpiloguePipeline_>
+struct FlatmmKernel
+{
+    using TilePartitioner                    = remove_cvref_t<TilePartitioner_>;
+    using FlatmmPipeline                     = remove_cvref_t<FlatmmPipeline_>;
+    using EpiloguePipeline                   = remove_cvref_t<EpiloguePipeline_>;
+    using ALayout                            = remove_cvref_t<typename FlatmmPipeline::ALayout>;
+    using BLayout                            = remove_cvref_t<typename FlatmmPipeline::BLayout>;
+    using CLayout                            = remove_cvref_t<typename FlatmmPipeline::CLayout>;
+    static constexpr index_t KernelBlockSize = FlatmmPipeline::BlockSize;
+
+    using ADataType = remove_cvref_t<typename FlatmmPipeline::ADataType>;
+    using BDataType = remove_cvref_t<typename FlatmmPipeline::BDataType>;
+    using CDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
+
+    static constexpr auto I0 = number<0>();
+    static constexpr auto I1 = number<1>();
+    static constexpr auto I2 = number<2>();
+
+    CK_TILE_HOST static constexpr auto GridSize(index_t M, index_t N, index_t KBatch)
+    {
+        return TilePartitioner::GridSize(M, N);
+        // return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
+    }
+
+    __host__ static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
+
+    struct FlatmmKernelArgs
+    {
+        const void* a_ptr;
+        const void* b_shuffle_ptr;
+        void* c_ptr;
+        index_t M;
+        index_t N;
+        index_t K;
+        index_t stride_A;
+        index_t stride_B;
+        index_t stride_C;
+        index_t KBatch;
+#if FEIFEI_DEBUG
+        const void* b_ptr;
+
+        void* dbg_int_ptr;
+        void* dbg_fp32_ptr;
+        void* dbg_f168_ptr;
+#endif
+    };
+
+    CK_TILE_HOST static constexpr FlatmmKernelArgs MakeKernelArgs(const FlatmmHostArgs& hostArgs)
+    {
+        return FlatmmKernelArgs{hostArgs.a_ptr,
+                                hostArgs.b_shuffle_ptr,
+                                hostArgs.c_ptr,
+                                hostArgs.M,
+                                hostArgs.N,
+                                hostArgs.K,
+                                hostArgs.stride_A,
+                                hostArgs.stride_B,
+                                hostArgs.stride_C,
+                                hostArgs.k_batch
+#if FEIFEI_DEBUG
+                                ,
+                                hostArgs.b_ptr,
+                                hostArgs.dbg_int_ptr,
+                                hostArgs.dbg_fp32_ptr,
+                                hostArgs.dbg_f168_ptr
+#endif
+        };
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return max(FlatmmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
+    }
+
+    struct SplitKBatchOffset
+    {
+        __device__ SplitKBatchOffset(const FlatmmKernelArgs& kargs,
+                                     const std::size_t k_id = blockIdx.z)
+        {
+            constexpr auto K1   = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t K_t   = kargs.KBatch * K1;
+            const index_t KRead = (kargs.K + K_t - 1) / K_t * K1;
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * KRead;
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, ALayout>)
+            {
+                a_k_split_offset = k_id * KRead * kargs.stride_A;
+            }
+
+            if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * KRead * kargs.stride_B;
+            }
+            else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BLayout>)
+            {
+                b_k_split_offset = k_id * KRead;
+            }
+
+            if(k_id < static_cast<uint32_t>(kargs.KBatch - 1))
+            {
+                splitted_k = KRead;
+            }
+            else
+            {
+                splitted_k = kargs.K - KRead * (kargs.KBatch - 1);
+            }
+        }
+
+        index_t a_k_split_offset;
+        index_t b_k_split_offset;
+        index_t splitted_k;
+    };
+
+    CK_TILE_HOST static bool IsSupportedArgument(const FlatmmKernelArgs& kargs)
+    {
+        if constexpr(EpiloguePipeline::template GetVectorSizeC<CDataType>() % 2 != 0 &&
+                     is_any_of<CDataType, fp16_t, bf16_t>::value)
+        {
+            if(kargs.k_batch != 1)
+            {
+                std::cerr << "Conditions not met for Kbatch >1 !" << std::endl;
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.K % TilePartitioner::KPerBlock != 0 && FlatmmPipeline::kPadK == false)
+            {
+                std::cerr << "Can't support K that is not a multiple of KPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.K % FlatmmPipeline::GetVectorSizeA() != 0)
+            {
+                std::cerr << "K is not a multiple of vector load size for A tensor!" << std::endl;
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::MPerBlock != 0 && FlatmmPipeline::kPadM == false)
+            {
+                std::cerr << "Can't support M that is not a multiple of MPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.M % FlatmmPipeline::GetVectorSizeA() != 0)
+            {
+                std::cerr << "M is not a multiple of vector load size for A tensor!" << std::endl;
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false)
+            {
+                std::cerr << "Can't support N that is not a multiple of NPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.N % FlatmmPipeline::GetVectorSizeB() != 0)
+            {
+                std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.K % TilePartitioner::KPerBlock != 0 && FlatmmPipeline::kPadK == false)
+            {
+                std::cerr << "Can't support K that is not a multiple of KPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.K % FlatmmPipeline::GetVectorSizeB() != 0)
+            {
+                std::cerr << "K is not a multiple of vector load size for B tensor!" << std::endl;
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+        {
+            if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false)
+            {
+                std::cerr << "Can't support N that is not a multiple of NPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.N % EpiloguePipeline::template GetVectorSizeC<CDataType>() != 0)
+            {
+                std::cerr << "N is not a multiple of vector load size for C tensor!" << std::endl;
+                return false;
+            }
+        }
+        else
+        {
+            if(kargs.M % TilePartitioner::MPerBlock != 0 && FlatmmPipeline::kPadM == false)
+            {
+                std::cerr << "Can't support M that is not a multiple of MPerBlock"
+                             " without padding!"
+                          << std::endl;
+                return false;
+            }
+            if(kargs.M % EpiloguePipeline::template GetVectorSizeC<CDataType>() != 0)
+            {
+                std::cerr << "M is not a multiple of vector load size for C tensor!" << std::endl;
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static auto MakeGemmTensorViews(const ADataType* a_ptr,
+                                                   const BDataType* b_ptr,
+                                                   CDataType* c_ptr,
+                                                   const FlatmmKernelArgs& kargs,
+                                                   const SplitKBatchOffset& splitk_batch_offset)
+    {
+        const auto& a_tensor_view = [&]() {
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
+                    make_tuple(kargs.stride_A, 1),
+                    number<FlatmmPipeline::GetVectorSizeA()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    a_ptr,
+                    make_tuple(splitk_batch_offset.splitted_k, kargs.M),
+                    make_tuple(kargs.stride_A, 1),
+                    number<FlatmmPipeline::GetVectorSizeA()>{},
+                    number<1>{});
+            }
+        }();
+
+        const auto& b_tensor_view = [&]() {
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_ptr,
+                    make_tuple(splitk_batch_offset.splitted_k, kargs.N),
+                    make_tuple(kargs.stride_B, 1),
+                    number<FlatmmPipeline::GetVectorSizeB()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global>(
+                    b_ptr,
+                    make_tuple(kargs.N, splitk_batch_offset.splitted_k),
+                    make_tuple(kargs.stride_B, 1),
+                    number<FlatmmPipeline::GetVectorSizeB()>{},
+                    number<1>{});
+            }
+        }();
+
+        // TODO: enable vector write for C in ColMajor
+        const auto& c_tensor_view = [&]() {
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(kargs.stride_C, 1),
+                    number<EpiloguePipeline::template GetVectorSizeC<CDataType>()>{},
+                    number<1>{});
+            }
+            else
+            {
+                return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
+                    c_ptr,
+                    make_tuple(kargs.M, kargs.N),
+                    make_tuple(1, kargs.stride_C),
+                    number<1>{},
+                    number<1>{});
+            }
+        }();
+
+        return make_tuple(a_tensor_view, b_tensor_view, c_tensor_view);
+    }
+
+    template <typename TensorView>
+    CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
+    {
+        const auto& a_pad_view = [&]() {
+            const auto& a_tensor_view = views.at(I0);
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, FlatmmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(a_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<FlatmmPipeline::kPadM, false>{});
+            }
+        }();
+
+        const auto& b_pad_view = [&]() {
+            const auto& b_tensor_view = views.at(I1);
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                return pad_tensor_view(b_tensor_view,
+                                       make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<false, FlatmmPipeline::kPadK>{});
+            }
+            else
+            {
+                return pad_tensor_view(b_tensor_view,
+                                       make_tuple(number<TilePartitioner::NPerBlock>{},
+                                                  number<TilePartitioner::KPerBlock>{}),
+                                       sequence<FlatmmPipeline::kPadN, false>{});
+            }
+        }();
+
+        const auto& c_pad_view = [&]() {
+            const auto& c_tensor_view = views.at(I2);
+            if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<false, FlatmmPipeline::kPadN>{});
+            }
+            else
+            {
+                return pad_tensor_view(c_tensor_view,
+                                       make_tuple(number<TilePartitioner::MPerBlock>{},
+                                                  number<TilePartitioner::NPerBlock>{}),
+                                       sequence<FlatmmPipeline::kPadM, false>{});
+            }
+        }();
+
+        return make_tuple(a_pad_view, b_pad_view, c_pad_view);
+    }
+
+    template <typename PadView>
+    CK_TILE_DEVICE static auto
+    MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
+    {
+        const auto& a_pad_view     = views.at(I0);
+        const auto& a_block_window = make_tile_window(
+            a_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::KPerBlock>{}),
+            {i_m, 0});
+
+        const auto& b_pad_view     = views.at(I1);
+        const auto& b_block_window = make_tile_window(
+            b_pad_view,
+            make_tuple(number<TilePartitioner::NPerBlock>{}, number<TilePartitioner::KPerBlock>{}),
+            {i_n, 0});
+
+        const auto& c_pad_view = views.at(I2);
+        auto c_block_window    = make_tile_window(
+            c_pad_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {i_m, i_n});
+
+        return make_tuple(a_block_window, b_block_window, c_block_window);
+    }
+
+    /**
+     * @brief Runs single GEMM problem cooperatively by whole workgroup.
+     *
+     * @param a_ptr input A pointer
+     * @param b_shuffle_ptr input B pointer
+     * @param c_ptr output C pointer
+     * @param kargs GEMM kernel arguments
+     * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
+     * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
+     *
+     * @tparam DstInMemOp Destination memory operation (default: set).
+     */
+    template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
+    CK_TILE_DEVICE static void RunFlatmm(const ADataType* a_ptr,
+                                         const BDataType* b_shuffle_ptr,
+                                         CDataType* c_ptr,
+                                         void* smem_ptr,
+                                         const FlatmmKernelArgs& kargs,
+                                         const SplitKBatchOffset& splitk_batch_offset,
+                                         const index_t block_idx_m,
+                                         const index_t block_idx_n
+#if FEIFEI_DEBUG
+                                         ,
+                                         const BDataType* b_ptr,
+                                         int* dbg_int,
+                                         float* dbg_fp32,
+                                         short* dbg_f168
+#endif
+    )
+    {
+
+        // Create Flatmm tensor views, pad views and tile windows
+        const auto& gemm_tensor_views_tuple = MakeGemmTensorViews<DstInMemOp>(
+            a_ptr, b_shuffle_ptr, c_ptr, kargs, splitk_batch_offset);
+        // origin layout
+        // const auto& gemm_tensor_views_tuple =
+        //    MakeGemmTensorViews<DstInMemOp>(a_ptr, b_ptr, c_ptr, kargs, splitk_batch_offset);
+
+        const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
+        auto gemm_tile_windows     = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
+
+        const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
+
+        // Run GEMM cooperatively by whole workgroup.
+        const auto& a_block_window = gemm_tile_windows.at(I0);
+        const auto& b_block_window = gemm_tile_windows.at(I1);
+        const auto& c_block_tile   = FlatmmPipeline{}.template operator()(a_block_window,
+                                                                        b_block_window,
+                                                                        num_loop,
+                                                                        smem_ptr
+#if FEIFEI_DEBUG
+                                                                        ,
+                                                                        b_ptr,
+                                                                        dbg_int,
+                                                                        dbg_fp32,
+                                                                        dbg_f168
+#endif
+        );
+
+        // feifei TODO: Un-comment bellow once pipeline() is implemented
+#if 0
+        // Run Epilogue Pipeline
+        auto& c_block_window = gemm_tile_windows.at(I2);
+
+        constexpr bool is_output_c_reg_transposed =
+            EpiloguePipeline::IsOutputTransposed() != FlatmmPipeline::IsTransposeC();
+        if constexpr((DstInMemOp == memory_operation_enum::set) || (sizeof(CDataType) > 2) ||
+                     (FlatmmPipeline::VectorSizeC % 2 == 0 &&
+                      std::is_same_v<CLayout, tensor_layout::gemm::RowMajor> &&
+                      is_output_c_reg_transposed))
+        {
+            EpiloguePipeline{}
+                .template operator()<decltype(c_block_window), decltype(c_block_tile), DstInMemOp>(
+                    c_block_window, c_block_tile);
+        }
+#endif
+    }
+
+    CK_TILE_DEVICE void operator()(FlatmmKernelArgs kargs) const
+    {
+#if FEIFEI_DEBUG
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[KERNEL] ===== FlatmmKernel() =====\n");
+            printf("[KERNEL] blockDim: [%d, %d], gridDim: [%d, %d]\n",
+                   static_cast<int>(blockDim.x),
+                   static_cast<int>(blockDim.y),
+                   static_cast<int>(gridDim.x),
+                   static_cast<int>(gridDim.y));
+            printf("[KERNEL] lds = %.3f (KB)\n", GetSmemSize() / 1024.0f);
+        }
+
+        uint32_t tidx = threadIdx.x;
+        uint32_t tidy = threadIdx.y;
+        uint32_t bidx = blockIdx.x;
+        uint32_t bidy = blockIdx.y;
+        uint32_t bdmx = blockDim.x;
+        uint32_t bdmy = blockDim.y;
+        uint32_t gdmx = gridDim.x;
+        uint32_t gdmy = gridDim.y;
+        uint32_t gid  = ((bdmx * bdmy) * gdmx) * bidy + (bdmx * bdmy) * bidx + bdmx * tidy + tidx;
+
+        const SplitKBatchOffset _splitk_batch_offset(kargs);
+        const BDataType* b_ptr =
+            static_cast<const BDataType*>(kargs.b_ptr) + _splitk_batch_offset.b_k_split_offset;
+
+        int* dbg_int    = static_cast<int*>(kargs.dbg_int_ptr);
+        float* dbg_fp32 = static_cast<float*>(kargs.dbg_fp32_ptr);
+        short* dbg_f168 = static_cast<short*>(kargs.dbg_f168_ptr);
+
+        dbg_int[gid]  = 1;
+        dbg_fp32[gid] = 1.0f;
+        dbg_f168[gid] = ck_tile::type_convert<ck_tile::half_t>(1.0f);
+#endif
+
+        const auto [iM, iN] = TilePartitioner::GetOutputTileIndex(blockIdx.x, blockIdx.y);
+        const index_t i_m   = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
+        const index_t i_n   = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
+
+        const SplitKBatchOffset splitk_batch_offset(kargs);
+        // options
+        const ADataType* a_ptr =
+            static_cast<const ADataType*>(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset;
+        const BDataType* b_shuffle_ptr = static_cast<const BDataType*>(kargs.b_shuffle_ptr) +
+                                         splitk_batch_offset.b_k_split_offset;
+        CDataType* c_ptr = static_cast<CDataType*>(kargs.c_ptr);
+
+        // allocate LDS
+        __shared__ char smem_ptr[GetSmemSize()];
+
+        if(kargs.KBatch == 1)
+        {
+            RunFlatmm(a_ptr,
+                      b_shuffle_ptr,
+                      c_ptr,
+                      smem_ptr,
+                      kargs,
+                      splitk_batch_offset,
+                      i_m,
+                      i_n
+#if FEIFEI_DEBUG
+                      ,
+                      b_ptr,
+                      dbg_int,
+                      dbg_fp32,
+                      dbg_f168
+#endif
+            );
+        }
+        else
+        {
+            RunFlatmm<memory_operation_enum::atomic_add>(a_ptr,
+                                                         b_shuffle_ptr,
+                                                         c_ptr,
+                                                         smem_ptr,
+                                                         kargs,
+                                                         splitk_batch_offset,
+                                                         i_m,
+                                                         i_n
+#if FEIFEI_DEBUG
+                                                         ,
+                                                         b_ptr,
+                                                         dbg_int,
+                                                         dbg_fp32,
+                                                         dbg_f168
+#endif
+            );
+        }
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
+
+namespace ck_tile {
+
+//  A Tile Window: global memory
+//  B Tile Window: global memory
+//  C Distributed tensor: register
+template <typename Problem, typename Policy = FlatmmPipelineAGmemBGmemCRegV1DefaultPolicy>
+struct FlatmmPipelineAGmemBGmemCRegV1
+{
+    using ADataType      = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
+    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+
+    using ALayout = remove_cvref_t<typename Problem::ALayout>;
+    using BLayout = remove_cvref_t<typename Problem::BLayout>;
+    using CLayout = remove_cvref_t<typename Problem::CLayout>;
+
+    using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
+
+    static constexpr index_t BlockSize = Problem::kBlockSize;
+
+    static constexpr index_t kMPerBlock = BlockGemmShape::kM;
+    static constexpr index_t kNPerBlock = BlockGemmShape::kN;
+    static constexpr index_t kKPerBlock = BlockGemmShape::kK;
+
+    static constexpr index_t GetVectorSizeA() { return Problem::VectorSizeA; }
+    static constexpr index_t GetVectorSizeB() { return Problem::VectorSizeB; }
+    static constexpr index_t GetVectorSizeC() { return Problem::VectorSizeC; }
+
+    static constexpr bool kPadM = Problem::kPadM;
+    static constexpr bool kPadN = Problem::kPadN;
+    static constexpr bool kPadK = Problem::kPadK;
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
+    {
+        return integer_divide_ceil(
+                   sizeof(ADataType) *
+                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
+                   16) *
+                   16 +
+               sizeof(BDataType) *
+                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        return Policy::template GetSmemSize<Problem>();
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return Policy::IsTransposeC(); }
+
+    template <typename ADramBlockWindowTmp,
+              typename BDramBlockWindowTmp,
+              typename AElementFunction,
+              typename BElementFunction>
+    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                        const AElementFunction& a_element_func,
+                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                        const BElementFunction& b_element_func,
+                                        index_t num_loop,
+                                        void* p_smem
+#if FEIFEI_DEBUG
+                                        ,
+                                        const BDataType* b_ptr,
+                                        int* dbg_int,
+                                        float* dbg_fp32,
+                                        short* dbg_f168
+#endif
+    ) const
+    {
+#if FEIFEI_DEBUG
+        if(threadIdx.x == 0 && blockIdx.x == 0 && threadIdx.y == 0 && blockIdx.y == 0)
+        {
+            printf("[PIPELN] FlatmmPipelinen():\n");
+        }
+
+        uint32_t tidx = threadIdx.x;
+        uint32_t tidy = threadIdx.y;
+        uint32_t bidx = blockIdx.x;
+        uint32_t bidy = blockIdx.y;
+        uint32_t bdmx = blockDim.x;
+        uint32_t bdmy = blockDim.y;
+        uint32_t gdmx = gridDim.x;
+        uint32_t gdmy = gridDim.y;
+        uint32_t gid  = ((bdmx * bdmy) * gdmx) * bidy + (bdmx * bdmy) * bidx + bdmx * tidy + tidx;
+
+        dbg_int[gid]  = -1;
+        dbg_fp32[gid] = -1.0f;
+        dbg_f168[gid] = ck_tile::type_convert<ck_tile::half_t>(-1.0f);
+#endif
+        static_assert(
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+            "wrong!");
+
+        static_assert(kMPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kNPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          kKPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                      "wrong!");
+
+#if 1
+        // feifei TODO: Implement gemm here
+        return nullptr;
+#else
+        // A tile in LDS
+        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+
+        constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+
+        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+        constexpr index_t a_lds_block_space_size_aligned =
+            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            16;
+
+        // B tile in LDS
+        BDataType* p_b_lds = static_cast<BDataType*>(
+            static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+
+        constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+
+        auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeADramTileDistribution<Problem>());
+
+        // A LDS tile window for store
+        auto a_copy_lds_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // B DRAM tile window for load
+        auto b_copy_dram_window =
+            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}),
+                             b_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeBDramTileDistribution<Problem>());
+
+        // B LDS tile window for store
+        auto b_copy_lds_window = make_tile_window(
+            b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // A LDS tile for block GEMM
+        auto a_lds_gemm_window = make_tile_window(
+            a_lds_block, make_tuple(number<kMPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // B LDS tile for block GEMM
+        auto b_lds_gemm_window = make_tile_window(
+            b_lds_block, make_tuple(number<kNPerBlock>{}, number<kKPerBlock>{}), {0, 0});
+
+        // Block GEMM
+        auto block_gemm = BlockGemm();
+
+        // Acc register tile
+        auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};
+
+        // return c_block_tile;
+
+        // prefetch
+        // global read 0
+        auto a_block_tile = load_tile(a_copy_dram_window);
+        auto b_block_tile = load_tile(b_copy_dram_window);
+
+        {
+            // move to 1
+            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(b_copy_dram_window, {0, kKPerBlock});
+
+            // initialize C
+            tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+            // LDS write 0
+            if constexpr(std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>)
+            {
+                auto a_shuffle_tmp = make_static_distributed_tensor<ADataType>(
+                    Policy::template MakeShuffledARegBlockDescriptor<Problem>());
+                shuffle_tile(a_shuffle_tmp, a_block_tile);
+                const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_shuffle_tmp);
+                store_tile(a_copy_lds_window, a_block_tile_tmp);
+            }
+            else
+            {
+                store_tile(a_copy_lds_window, tile_elementwise_in(a_element_func, a_block_tile));
+            }
+
+            // LDS write 0
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                auto b_shuffle_tmp = make_static_distributed_tensor<BDataType>(
+                    Policy::template MakeShuffledBRegBlockDescriptor<Problem>());
+                shuffle_tile(b_shuffle_tmp, b_block_tile);
+                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_shuffle_tmp);
+                store_tile(b_copy_lds_window, b_block_tile_tmp);
+            }
+            else
+            {
+                store_tile(b_copy_lds_window, tile_elementwise_in(b_element_func, b_block_tile));
+            }
+        }
+
+        index_t iCounter = num_loop - 1;
+        while(iCounter > 0)
+        {
+            // global read i + 1
+            a_block_tile = load_tile(a_copy_dram_window);
+            b_block_tile = load_tile(b_copy_dram_window);
+
+            block_sync_lds();
+
+            // GEMM i
+            block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+
+            block_sync_lds();
+
+            // move to i + 2
+            move_tile_window(a_copy_dram_window, {0, kKPerBlock});
+            move_tile_window(b_copy_dram_window, {0, kKPerBlock});
+
+            // LDS write i + 1
+            const auto a_block_tile_tmp = tile_elementwise_in(a_element_func, a_block_tile);
+            store_tile(a_copy_lds_window, a_block_tile_tmp);
+
+            // LDS write i + 1
+            if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
+            {
+                auto b_shuffle_tmp_loop = make_static_distributed_tensor<BDataType>(
+                    Policy::template MakeShuffledBRegBlockDescriptor<Problem>());
+                shuffle_tile(b_shuffle_tmp_loop, b_block_tile);
+                store_tile(b_copy_lds_window,
+                           tile_elementwise_in(b_element_func, b_shuffle_tmp_loop));
+            }
+            else
+            {
+                const auto b_block_tile_tmp = tile_elementwise_in(b_element_func, b_block_tile);
+                store_tile(b_copy_lds_window, b_block_tile_tmp);
+            }
+
+            iCounter--;
+        }
+
+        // tail
+        {
+            block_sync_lds();
+
+            // GEMM num_loop - 1
+            block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+        }
+
+        return c_block_tile;
+#endif
+    }
+
+    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                   const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                   index_t num_loop,
+                                   void* p_smem
+#if FEIFEI_DEBUG
+                                   ,
+                                   const BDataType* b_ptr,
+                                   int* dbg_int,
+                                   float* dbg_fp32,
+                                   short* dbg_f168
+#endif
+    ) const
+    {
+        return operator()(
+            a_dram_block_window_tmp,
+            [](const ADataType & a) { return a; },
+            b_dram_block_window_tmp,
+            [](const BDataType & b) { return b; },
+            num_loop,
+            p_smem
+#if FEIFEI_DEBUG
+            ,
+            b_ptr,
+            dbg_int,
+            dbg_fp32,
+            dbg_f168
+#endif
+        );
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+
+namespace ck_tile {
+
+// Default policy for GemmPipelineAGmemBGmemCRegV1
+// Default policy class should not be templated, put template on member functions instead
+struct FlatmmPipelineAGmemBGmemCRegV1DefaultPolicy
+{
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+    static constexpr auto I2 = number<2>{};
+
+    static constexpr bool TransposeC = true;
+
+#if 0
+    // 2d
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr auto a_lds_block_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(kMPerBlock, kKPerBlock), number<32>{});
+
+        return a_lds_block_desc;
+    }
+
+    // 2d
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+
+        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr auto b_lds_block_desc =
+            make_naive_tensor_descriptor_packed(make_tuple(kNPerBlock, kKPerBlock), number<32>{});
+
+        return b_lds_block_desc;
+    }
+#elif 1
+    // 3d + padding
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        // TODO: this 8 is AK1! should be a policy parameter!
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kKPerBlock / 8>{}, number<kMPerBlock>{}, number<8>{}),
+            make_tuple(number<(kMPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
+            number<8>{},
+            number<1>{});
+
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+            a_lds_block_desc_0,
+            make_tuple(make_pass_through_transform(kMPerBlock),
+                       make_merge_transform(make_tuple(kKPerBlock / 8, 8))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return a_lds_block_desc;
+    }
+
+    // 3d + padding
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<kKPerBlock / 8>{}, number<kNPerBlock>{}, number<8>{}),
+            make_tuple(number<(kNPerBlock + 1) * 8>{}, number<8>{}, number<1>{}),
+            number<8>{},
+            number<1>{});
+
+        constexpr auto b_lds_block_desc = transform_tensor_descriptor(
+            b_lds_block_desc_0,
+            make_tuple(make_pass_through_transform(kNPerBlock),
+                       make_merge_transform(make_tuple(kKPerBlock / 8, 8))),
+            make_tuple(sequence<1>{}, sequence<0, 2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return b_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
+    {
+        constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
+                                        MakeALdsBlockDescriptor<Problem>().get_element_space_size();
+        return smem_size_a;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
+    {
+        constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
+                                        MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+        return smem_size_b;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
+        constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
+        constexpr index_t smem_size   = smem_size_a + smem_size_b;
+
+        return smem_size;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackA()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        return Problem::VectorLoadSize / sizeof(ADataType);
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemPackB()
+    {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        return Problem::VectorLoadSize / sizeof(BDataType);
+    }
+#elif 1
+    // fake XOR
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr auto a_lds_block_desc_d1_d2_d3 = make_naive_tensor_descriptor_packed(
+            make_tuple(number<kMPerBlock / 2>{}, number<2>{}, number<kKPerBlock>{}),
+            number<kKPerBlock>{});
+
+        constexpr index_t kK1 = 16 / sizeof(ADataType);
+
+        constexpr auto a_lds_block_desc_d4_d5_d6 = transform_tensor_descriptor(
+            a_lds_block_desc_d1_d2_d3,
+            make_tuple(
+                make_xor_transform(make_tuple(number<kMPerBlock / 2>{}, number<kKPerBlock>{}), kK1),
+                make_pass_through_transform(2)),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}));
+
+        constexpr auto a_lds_block_desc_m_k = transform_tensor_descriptor(
+            a_lds_block_desc_d4_d5_d6,
+            make_tuple(make_merge_transform(make_tuple(number<kMPerBlock / 2>{}, number<2>{})),
+                       make_pass_through_transform(kKPerBlock)),
+            make_tuple(sequence<0, 1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return a_lds_block_desc_m_k;
+    }
+
+    // fake XOR
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+        using namespace ck_tile;
+
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+
+        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr auto b_lds_block_desc_d1_d2_d3 = make_naive_tensor_descriptor_packed(
+            make_tuple(number<kNPerBlock / 2>{}, number<2>{}, number<kKPerBlock>{}),
+            number<kKPerBlock>{});
+
+        constexpr index_t kK1 = 16 / sizeof(BDataType);
+
+        constexpr auto b_lds_block_desc_d4_d5_d6 = transform_tensor_descriptor(
+            b_lds_block_desc_d1_d2_d3,
+            make_tuple(
+                make_xor_transform(make_tuple(number<kNPerBlock / 2>{}, number<kKPerBlock>{}), kK1),
+                make_pass_through_transform(2)),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}));
+
+        constexpr auto b_lds_block_desc_n_k = transform_tensor_descriptor(
+            b_lds_block_desc_d4_d5_d6,
+            make_tuple(make_merge_transform(make_tuple(number<kNPerBlock / 2>{}, number<2>{})),
+                       make_pass_through_transform(kKPerBlock)),
+            make_tuple(sequence<0, 1>{}, sequence<2>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return b_lds_block_desc_n_k;
+    }
+#endif
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t M0           = MPerBlock / M1;
+            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % M1 == 0);
+            constexpr index_t K3    = total_pixels / M1;
+            constexpr index_t KPack = GetSmemPackA<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * M0))
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * M0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * M0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+            constexpr index_t K1 = 16 / sizeof(ADataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t M2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (M2 * K0) == 0)
+            {
+                constexpr index_t M1 = BlockSize / get_warp_size();
+                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
+                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
+                constexpr index_t M0 = MPerBlock / (M2 * M1);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M2, M1 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            else
+            {
+                constexpr index_t M0 = BlockSize / get_warp_size();
+                constexpr index_t M1 = MPerBlock / (M2 * M0);
+                static_assert(M0 * M1 * M2 == MPerBlock,
+                              "Incorrect M0, M1, M2 configuration! "
+                              "M0, M1, M2 must cover whole MPerBlock!");
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
+    {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t N0           = NPerBlock / N1;
+            constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % N1 == 0);
+            constexpr index_t K3    = total_pixels / N1;
+            constexpr index_t KPack = GetSmemPackB<Problem>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * N0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * N0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t N2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (N2 * K0) == 0)
+            {
+                constexpr index_t N1 = BlockSize / get_warp_size();
+                static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+                static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
+                constexpr index_t N0 = NPerBlock / (N2 * N1);
+                static_assert(N0 * N1 * N2 == NPerBlock,
+                              "Incorrect N0, N1, N2 configuration! "
+                              "N0, N1, N2 must cover whole NPerBlock!");
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            // coalesce reading for each warps
+            else
+            {
+                constexpr index_t N0 = BlockSize / get_warp_size();
+                constexpr index_t N1 = NPerBlock / (N2 * N0);
+                static_assert(N0 * N1 * N2 == NPerBlock,
+                              "Incorrect N0, N1, N2 configuration! "
+                              "N0, N1, N2 must cover whole NPerBlock!");
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor()
+    {
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        static_assert(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+        constexpr index_t N0           = kNPerBlock / N1;
+        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
+        static_assert(total_pixels % N1 == 0);
+        constexpr index_t K3     = total_pixels / N1;
+        constexpr index_t kKPack = GetSmemPackB<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * N0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * N0);
+            constexpr index_t K0 = kBlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * N0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
+            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor()
+    {
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::RowMajor>);
+        constexpr index_t kBlockSize = Problem::kBlockSize;
+        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+        constexpr index_t M0           = kMPerBlock / M1;
+        constexpr index_t total_pixels = kMPerBlock * kKPerBlock / kBlockSize;
+        static_assert(total_pixels % M1 == 0);
+        constexpr index_t K3     = total_pixels / M1;
+        constexpr index_t kKPack = GetSmemPackA<Problem>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * M0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * M0);
+            constexpr index_t K0 = kBlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * M0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = kBlockSize / get_warp_size() / K1;
+            static_assert(kKPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
+    {
+        using AccDataType     = float;
+        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                       typename Problem::BDataType,
+                                                       AccDataType,
+                                                       WarpTile::at(I0),
+                                                       WarpTile::at(I1),
+                                                       WarpTile::at(I2),
+                                                       TransposeC>;
+        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                                      typename Problem::BDataType,
+                                                                      typename Problem::CDataType,
+                                                                      BlockWarps,
+                                                                      WarpGemm>;
+
+        return BlockUniversalGemmAsBsCr<Problem, BlockGemmPolicy>{};
+    }
+};
+
+} // namespace ck_tile
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_universal_pipeline_ag_bg_cr_policy.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
+
+namespace ck_tile {
+
+// UniversalGemm Policy
+struct UniversalFlatmmPipelineAgBgCrPolicy
+{
+    static constexpr auto I0 = number<0>{};
+    static constexpr auto I1 = number<1>{};
+    static constexpr auto I2 = number<2>{};
+
+    static constexpr bool TransposeC = true;
+
+    template <typename Problem, typename DataType, index_t MNPerBlock>
+    CK_TILE_HOST_DEVICE static constexpr auto GetVectorLoadSize()
+    {
+        constexpr index_t BlockSize           = Problem::kBlockSize;
+        constexpr index_t KPerBlock           = Problem::BlockGemmShape::kK;
+        constexpr index_t elements_per_thread = MNPerBlock * KPerBlock / BlockSize;
+
+        if constexpr(elements_per_thread % (16 / sizeof(DataType)) == 0)
+        {
+            return (16 / sizeof(DataType));
+        }
+        else if constexpr(elements_per_thread % (8 / sizeof(DataType)) == 0)
+        {
+            return (8 / sizeof(DataType));
+        }
+        else if constexpr(elements_per_thread % (4 / sizeof(DataType)) == 0 &&
+                          sizeof(DataType) >= 4)
+        {
+            return (4 / sizeof(DataType));
+        }
+        else if constexpr(elements_per_thread % (2 / sizeof(DataType)) == 0 &&
+                          sizeof(DataType) >= 2)
+        {
+            return (2 / sizeof(DataType));
+        }
+        else
+        {
+            return 1;
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t KPack     = GetVectorLoadSize<Problem, ADataType, MPerBlock>();
+
+        constexpr auto DataTypeSize = sizeof(ADataType);
+        constexpr auto MLdsLayer =
+            (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
+
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<KPerBlock / KPack * MLdsLayer>{},
+                       number<MPerBlock / MLdsLayer>{},
+                       number<KPack>{}),
+            make_tuple(number<KPack>{}, number<KPerBlock * MLdsLayer>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+
+        constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+            a_lds_block_desc_0,
+            make_tuple(make_xor_transform(make_tuple(number<MPerBlock / MLdsLayer>{},
+                                                     number<KPerBlock / KPack * MLdsLayer>{})),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}));
+
+        constexpr auto a_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
+            a_lds_block_desc_permuted,
+            make_tuple(make_unmerge_transform(
+                           make_tuple(number<KPerBlock / KPack>{}, number<MLdsLayer>{})),
+                       make_pass_through_transform(number<MPerBlock / MLdsLayer>{}),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
+
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+            a_lds_block_desc_xk0_mnldslayer_mn_xk1,
+            make_tuple(make_merge_transform_v3_division_mod(
+                           make_tuple(number<MPerBlock / MLdsLayer>{}, number<MLdsLayer>{})),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
+            make_tuple(sequence<1, 2>{}, sequence<0, 3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return a_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+        constexpr index_t KPack     = GetVectorLoadSize<Problem, BDataType, NPerBlock>();
+
+        constexpr auto DataTypeSize = sizeof(BDataType);
+        constexpr auto NLdsLayer =
+            (32 * 4 / KPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / KPerBlock / DataTypeSize);
+
+        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
+            make_tuple(number<KPerBlock / KPack * NLdsLayer>{},
+                       number<NPerBlock / NLdsLayer>{},
+                       number<KPack>{}),
+            make_tuple(number<KPack>{}, number<KPerBlock * NLdsLayer>{}, number<1>{}),
+            number<KPack>{},
+            number<1>{});
+
+        constexpr auto b_lds_block_desc_permuted = transform_tensor_descriptor(
+            b_lds_block_desc_0,
+            make_tuple(make_xor_transform(make_tuple(number<NPerBlock / NLdsLayer>{},
+                                                     number<KPerBlock / KPack * NLdsLayer>{})),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}),
+            make_tuple(sequence<1, 0>{}, sequence<2>{}));
+
+        constexpr auto b_lds_block_desc_xk0_mnldslayer_mn_xk1 = transform_tensor_descriptor(
+            b_lds_block_desc_permuted,
+            make_tuple(make_unmerge_transform(
+                           make_tuple(number<KPerBlock / KPack>{}, number<NLdsLayer>{})),
+                       make_pass_through_transform(number<NPerBlock / NLdsLayer>{}),
+                       make_pass_through_transform(number<KPack>{})),
+            make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}),
+            make_tuple(sequence<0, 2>{}, sequence<1>{}, sequence<3>{}));
+
+        constexpr auto b_lds_block_desc = transform_tensor_descriptor(
+            b_lds_block_desc_xk0_mnldslayer_mn_xk1,
+            make_tuple(make_merge_transform_v3_division_mod(
+                           make_tuple(number<NPerBlock / NLdsLayer>{}, number<NLdsLayer>{})),
+                       make_merge_transform_v3_division_mod(
+                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
+            make_tuple(sequence<1, 2>{}, sequence<0, 3>{}),
+            make_tuple(sequence<0>{}, sequence<1>{}));
+        return b_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
+    {
+        constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
+                                        MakeALdsBlockDescriptor<Problem>().get_element_space_size();
+        return smem_size_a;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
+    {
+        constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
+                                        MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+        return smem_size_b;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
+    {
+        constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
+        constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
+        index_t smem_size             = 0;
+        smem_size += smem_size_a + smem_size_b;
+
+        return smem_size;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
+    {
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
+        {
+            constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t M0           = MPerBlock / M1;
+            constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % M1 == 0);
+            constexpr index_t K3    = total_pixels / M1;
+            constexpr index_t KPack = GetVectorLoadSize<Problem, ADataType, MPerBlock>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * M0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * M0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * M0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(ADataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t M2 = get_warp_size() / K0;
+            if constexpr(get_warp_size() % (M2 * K0) == 0)
+            {
+                constexpr index_t M1 = BlockSize / get_warp_size();
+                static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
+                static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
+                constexpr index_t M0 = MPerBlock / (M2 * M1);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            else
+            {
+                constexpr index_t M0 = BlockSize / get_warp_size();
+                constexpr index_t M1 = MPerBlock / (M2 * M0);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
+    {
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+
+        constexpr index_t BlockSize = Problem::kBlockSize;
+
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
+        {
+            constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t N0           = NPerBlock / N1;
+            constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
+            static_assert(total_pixels % N1 == 0);
+            constexpr index_t K3    = total_pixels / N1;
+            constexpr index_t KPack = GetVectorLoadSize<Problem, BDataType, NPerBlock>();
+            static_assert(KPack % K3 == 0);
+            constexpr index_t K2 = KPack / K3;
+            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            {
+                constexpr index_t K1 = get_warp_size() / (K2 * N0);
+                constexpr index_t K0 = BlockSize / get_warp_size();
+                static_assert(KPerBlock == K0 * K1 * K2 * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                               tuple<sequence<2>, sequence<2, 1, 2>>,
+                                               tuple<sequence<0>, sequence<1, 0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+            else
+            {
+                constexpr index_t K1   = (K2 * N0) / get_warp_size();
+                constexpr index_t K2_m = K2 / K1;
+                constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+                static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                               tuple<sequence<2, 2>, sequence<1, 2>>,
+                                               tuple<sequence<0, 1>, sequence<0, 2>>,
+                                               sequence<2, 1>,
+                                               sequence<3, 1>>{});
+            }
+        }
+        else
+        {
+
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(BDataType);
+            constexpr index_t K0 = KPerBlock / K1;
+            constexpr index_t N2 = get_warp_size() / K0;
+            // coalesce reading for each blocks
+            if constexpr(get_warp_size() % (N2 * K0) == 0)
+            {
+                constexpr index_t N1 = BlockSize / get_warp_size();
+                static_assert(N2 != 0, "N2 is zero, which will lead to a division by zero error.");
+                static_assert(N1 != 0, "N1 is zero, which will lead to a division by zero error.");
+                constexpr index_t N0 = NPerBlock / (N2 * N1);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<1>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<0, 1>>{});
+            }
+            // coalesce reading for each warps
+            else
+            {
+                constexpr index_t N0 = BlockSize / get_warp_size();
+                constexpr index_t N1 = NPerBlock / (N2 * N0);
+
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<sequence<1>,
+                                               tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                               tuple<sequence<1>, sequence<1, 2>>,
+                                               tuple<sequence<0>, sequence<2, 0>>,
+                                               sequence<1, 2>,
+                                               sequence<1, 1>>{});
+            }
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledARegBlockDescriptor()
+    {
+        using ALayout   = remove_cvref_t<typename Problem::ALayout>;
+        using ADataType = remove_cvref_t<typename Problem::ADataType>;
+        static_assert(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>);
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t MPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t M1           = Problem::VectorLoadSize / sizeof(ADataType);
+        constexpr index_t M0           = MPerBlock / M1;
+        constexpr index_t total_pixels = MPerBlock * KPerBlock / BlockSize;
+        static_assert(total_pixels % M1 == 0);
+        constexpr index_t K3     = total_pixels / M1;
+        constexpr index_t kKPack = GetVectorLoadSize<Problem, ADataType, MPerBlock>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * M0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * M0);
+            constexpr index_t K0 = BlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * M0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+            static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<M0, M1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeShuffledBRegBlockDescriptor()
+    {
+        using BLayout   = remove_cvref_t<typename Problem::BLayout>;
+        using BDataType = remove_cvref_t<typename Problem::BDataType>;
+        static_assert(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>);
+        constexpr index_t BlockSize = Problem::kBlockSize;
+        constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
+        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+
+        constexpr index_t N1           = Problem::VectorLoadSize / sizeof(BDataType);
+        constexpr index_t N0           = NPerBlock / N1;
+        constexpr index_t total_pixels = NPerBlock * KPerBlock / BlockSize;
+        static_assert(total_pixels % N1 == 0);
+        constexpr index_t K3     = total_pixels / N1;
+        constexpr index_t kKPack = GetVectorLoadSize<Problem, BDataType, NPerBlock>();
+        static_assert(kKPack % K3 == 0);
+        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
+        constexpr index_t warp_size = get_warp_size();
+        if constexpr(warp_size % (K2 * N0) == 0)
+        {
+            constexpr index_t K1 = warp_size / (K2 * N0);
+            constexpr index_t K0 = BlockSize / warp_size;
+
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2, K3>>,
+                                           tuple<sequence<2>, sequence<2, 1, 2>>,
+                                           tuple<sequence<0>, sequence<1, 0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+        else
+        {
+            constexpr index_t K1   = (K2 * N0) / get_warp_size();
+            constexpr index_t K2_m = K2 / K1;
+            constexpr index_t K0   = BlockSize / get_warp_size() / K1;
+            static_assert(KPerBlock == K0 * K1 * K2_m * K3);
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0, N1>, sequence<K0, K1, K2_m, K3>>,
+                                           tuple<sequence<2, 2>, sequence<1, 2>>,
+                                           tuple<sequence<0, 1>, sequence<0, 2>>,
+                                           sequence<1, 2>,
+                                           sequence<1, 3>>{});
+        }
+    }
+
+    CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC() { return TransposeC; }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
+    {
+        using AccDataType     = float;
+        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
+        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
+                                                       typename Problem::BDataType,
+                                                       AccDataType,
+                                                       WarpTile::at(I0),
+                                                       WarpTile::at(I1),
+                                                       WarpTile::at(I2),
+                                                       TransposeC>;
+        using BlockGemmPolicy = BlockGemmASmemBSmemCRegV1CustomPolicy<typename Problem::ADataType,
+                                                                      typename Problem::BDataType,
+                                                                      typename Problem::CDataType,
+                                                                      BlockWarps,
+                                                                      WarpGemm>;
+        return BlockGemmASmemBSmemCRegV1<Problem, BlockGemmPolicy>{};
+    }
+};
+
+} // namespace ck_tile